• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // See docs in ../ops/image_ops.cc
17 
18 #define EIGEN_USE_THREADS
19 
20 #include "tensorflow/core/kernels/image/non_max_suppression_op.h"
21 
#include <cmath>
#include <functional>
#include <limits>
#include <queue>
#include <vector>
26 
27 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
28 #include "tensorflow/core/framework/bounds_check.h"
29 #include "tensorflow/core/framework/op_kernel.h"
30 #include "tensorflow/core/framework/register_types.h"
31 #include "tensorflow/core/framework/tensor.h"
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/framework/types.h"
34 #include "tensorflow/core/lib/core/status.h"
35 #include "tensorflow/core/platform/logging.h"
36 
37 namespace tensorflow {
38 namespace {
39 
40 typedef Eigen::ThreadPoolDevice CPUDevice;
41 
CheckScoreSizes(OpKernelContext * context,int num_boxes,const Tensor & scores)42 static inline void CheckScoreSizes(OpKernelContext* context, int num_boxes,
43                                    const Tensor& scores) {
44   // The shape of 'scores' is [num_boxes]
45   OP_REQUIRES(context, scores.dims() == 1,
46               errors::InvalidArgument(
47                   "scores must be 1-D", scores.shape().DebugString(),
48                   " (Shape must be rank 1 but is rank ", scores.dims(), ")"));
49   OP_REQUIRES(
50       context, scores.dim_size(0) == num_boxes,
51       errors::InvalidArgument("scores has incompatible shape (Dimensions must "
52                               "be equal, but are ",
53                               num_boxes, " and ", scores.dim_size(0), ")"));
54 }
55 
ParseAndCheckOverlapSizes(OpKernelContext * context,const Tensor & overlaps,int * num_boxes)56 static inline void ParseAndCheckOverlapSizes(OpKernelContext* context,
57                                              const Tensor& overlaps,
58                                              int* num_boxes) {
59   // the shape of 'overlaps' is [num_boxes, num_boxes]
60   OP_REQUIRES(context, overlaps.dims() == 2,
61               errors::InvalidArgument("overlaps must be 2-D",
62                                       overlaps.shape().DebugString()));
63 
64   *num_boxes = overlaps.dim_size(0);
65   OP_REQUIRES(context, overlaps.dim_size(1) == *num_boxes,
66               errors::InvalidArgument("overlaps must be square",
67                                       overlaps.shape().DebugString()));
68 }
69 
ParseAndCheckBoxSizes(OpKernelContext * context,const Tensor & boxes,int * num_boxes)70 static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
71                                          const Tensor& boxes, int* num_boxes) {
72   // The shape of 'boxes' is [num_boxes, 4]
73   OP_REQUIRES(context, boxes.dims() == 2,
74               errors::InvalidArgument(
75                   "boxes must be 2-D", boxes.shape().DebugString(),
76                   " (Shape must be rank 2 but is rank ", boxes.dims(), ")"));
77   *num_boxes = boxes.dim_size(0);
78   OP_REQUIRES(context, boxes.dim_size(1) == 4,
79               errors::InvalidArgument("boxes must have 4 columns (Dimension "
80                                       "must be 4 but is ",
81                                       boxes.dim_size(1), ")"));
82 }
83 
CheckCombinedNMSScoreSizes(OpKernelContext * context,int num_boxes,const Tensor & scores)84 static inline void CheckCombinedNMSScoreSizes(OpKernelContext* context,
85                                               int num_boxes,
86                                               const Tensor& scores) {
87   // The shape of 'scores' is [batch_size, num_boxes, num_classes]
88   OP_REQUIRES(context, scores.dims() == 3,
89               errors::InvalidArgument("scores must be 3-D",
90                                       scores.shape().DebugString()));
91   OP_REQUIRES(context, scores.dim_size(1) == num_boxes,
92               errors::InvalidArgument("scores has incompatible shape"));
93 }
94 
ParseAndCheckCombinedNMSBoxSizes(OpKernelContext * context,const Tensor & boxes,int * num_boxes,const int num_classes)95 static inline void ParseAndCheckCombinedNMSBoxSizes(OpKernelContext* context,
96                                                     const Tensor& boxes,
97                                                     int* num_boxes,
98                                                     const int num_classes) {
99   // The shape of 'boxes' is [batch_size, num_boxes, q, 4]
100   OP_REQUIRES(context, boxes.dims() == 4,
101               errors::InvalidArgument("boxes must be 4-D",
102                                       boxes.shape().DebugString()));
103 
104   bool box_check = boxes.dim_size(2) == 1 || boxes.dim_size(2) == num_classes;
105   OP_REQUIRES(context, box_check,
106               errors::InvalidArgument(
107                   "third dimension of boxes must be either 1 or num classes"));
108   *num_boxes = boxes.dim_size(1);
109   OP_REQUIRES(context, boxes.dim_size(3) == 4,
110               errors::InvalidArgument("boxes must have 4 columns"));
111 }
112 // Return intersection-over-union overlap between boxes i and j
113 template <typename T>
IOU(typename TTypes<T,2>::ConstTensor boxes,int i,int j)114 static inline T IOU(typename TTypes<T, 2>::ConstTensor boxes, int i, int j) {
115   const T ymin_i = std::min<T>(boxes(i, 0), boxes(i, 2));
116   const T xmin_i = std::min<T>(boxes(i, 1), boxes(i, 3));
117   const T ymax_i = std::max<T>(boxes(i, 0), boxes(i, 2));
118   const T xmax_i = std::max<T>(boxes(i, 1), boxes(i, 3));
119   const T ymin_j = std::min<T>(boxes(j, 0), boxes(j, 2));
120   const T xmin_j = std::min<T>(boxes(j, 1), boxes(j, 3));
121   const T ymax_j = std::max<T>(boxes(j, 0), boxes(j, 2));
122   const T xmax_j = std::max<T>(boxes(j, 1), boxes(j, 3));
123   const T area_i = (ymax_i - ymin_i) * (xmax_i - xmin_i);
124   const T area_j = (ymax_j - ymin_j) * (xmax_j - xmin_j);
125   if (area_i <= static_cast<T>(0) || area_j <= static_cast<T>(0)) {
126     return static_cast<T>(0.0);
127   }
128   const T intersection_ymin = std::max<T>(ymin_i, ymin_j);
129   const T intersection_xmin = std::max<T>(xmin_i, xmin_j);
130   const T intersection_ymax = std::min<T>(ymax_i, ymax_j);
131   const T intersection_xmax = std::min<T>(xmax_i, xmax_j);
132   const T intersection_area =
133       std::max<T>(intersection_ymax - intersection_ymin, static_cast<T>(0.0)) *
134       std::max<T>(intersection_xmax - intersection_xmin, static_cast<T>(0.0));
135   return intersection_area / (area_i + area_j - intersection_area);
136 }
137 
138 template <typename T>
Overlap(typename TTypes<T,2>::ConstTensor overlaps,int i,int j)139 static inline T Overlap(typename TTypes<T, 2>::ConstTensor overlaps, int i,
140                         int j) {
141   return overlaps(i, j);
142 }
143 
144 template <typename T>
CreateIOUSimilarityFn(const Tensor & boxes)145 static inline std::function<T(int, int)> CreateIOUSimilarityFn(
146     const Tensor& boxes) {
147   typename TTypes<T, 2>::ConstTensor boxes_data = boxes.tensor<T, 2>();
148   return std::bind(&IOU<T>, boxes_data, std::placeholders::_1,
149                    std::placeholders::_2);
150 }
151 
152 template <typename T>
CreateOverlapSimilarityFn(const Tensor & overlaps)153 static inline std::function<T(int, int)> CreateOverlapSimilarityFn(
154     const Tensor& overlaps) {
155   typename TTypes<T, 2>::ConstTensor overlaps_data =
156       overlaps.tensor<float, 2>();
157   return std::bind(&Overlap<T>, overlaps_data, std::placeholders::_1,
158                    std::placeholders::_2);
159 }
160 
161 template <typename T>
DoNonMaxSuppressionOp(OpKernelContext * context,const Tensor & scores,int num_boxes,const Tensor & max_output_size,const T similarity_threshold,const T score_threshold,const T soft_nms_sigma,const std::function<T (int,int)> & similarity_fn,bool return_scores_tensor=false,bool pad_to_max_output_size=false,int * ptr_num_valid_outputs=nullptr)162 void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& scores,
163                            int num_boxes, const Tensor& max_output_size,
164                            const T similarity_threshold,
165                            const T score_threshold, const T soft_nms_sigma,
166                            const std::function<T(int, int)>& similarity_fn,
167                            bool return_scores_tensor = false,
168                            bool pad_to_max_output_size = false,
169                            int* ptr_num_valid_outputs = nullptr) {
170   const int output_size = max_output_size.scalar<int>()();
171 
172   std::vector<T> scores_data(num_boxes);
173   std::copy_n(scores.flat<T>().data(), num_boxes, scores_data.begin());
174 
175   // Data structure for a selection candidate in NMS.
176   struct Candidate {
177     int box_index;
178     T score;
179     int suppress_begin_index;
180   };
181 
182   auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
183     return ((bs_i.score == bs_j.score) && (bs_i.box_index > bs_j.box_index)) ||
184            bs_i.score < bs_j.score;
185   };
186   std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)>
187       candidate_priority_queue(cmp);
188   for (int i = 0; i < scores_data.size(); ++i) {
189     if (scores_data[i] > score_threshold) {
190       candidate_priority_queue.emplace(Candidate({i, scores_data[i], 0}));
191     }
192   }
193 
194   T scale = static_cast<T>(0.0);
195   bool is_soft_nms = soft_nms_sigma > static_cast<T>(0.0);
196   if (is_soft_nms) {
197     scale = static_cast<T>(-0.5) / soft_nms_sigma;
198   }
199 
200   auto suppress_weight = [similarity_threshold, scale,
201                           is_soft_nms](const T sim) {
202     const T weight =
203         static_cast<T>(std::exp(static_cast<float>(scale * sim * sim)));
204     return is_soft_nms || sim <= similarity_threshold ? weight
205                                                       : static_cast<T>(0.0);
206   };
207 
208   std::vector<int> selected;
209   std::vector<T> selected_scores;
210   T similarity, original_score;
211   Candidate next_candidate;
212 
213   while (selected.size() < output_size && !candidate_priority_queue.empty()) {
214     next_candidate = candidate_priority_queue.top();
215     original_score = next_candidate.score;
216     candidate_priority_queue.pop();
217 
218     // Overlapping boxes are likely to have similar scores, therefore we
219     // iterate through the previously selected boxes backwards in order to
220     // see if `next_candidate` should be suppressed. We also enforce a property
221     // that a candidate can be suppressed by another candidate no more than
222     // once via `suppress_begin_index` which tracks which previously selected
223     // boxes have already been compared against next_candidate prior to a given
224     // iteration.  These previous selected boxes are then skipped over in the
225     // following loop.
226     bool should_hard_suppress = false;
227     for (int j = static_cast<int>(selected.size()) - 1;
228          j >= next_candidate.suppress_begin_index; --j) {
229       similarity = similarity_fn(next_candidate.box_index, selected[j]);
230 
231       next_candidate.score *= suppress_weight(similarity);
232 
233       // First decide whether to perform hard suppression
234       if (!is_soft_nms && similarity > static_cast<T>(similarity_threshold)) {
235         should_hard_suppress = true;
236         break;
237       }
238 
239       // If next_candidate survives hard suppression, apply soft suppression
240       if (next_candidate.score <= score_threshold) break;
241     }
242     // If `next_candidate.score` has not dropped below `score_threshold`
243     // by this point, then we know that we went through all of the previous
244     // selections and can safely update `suppress_begin_index` to
245     // `selected.size()`. If on the other hand `next_candidate.score`
246     // *has* dropped below the score threshold, then since `suppress_weight`
247     // always returns values in [0, 1], further suppression by items that were
248     // not covered in the above for loop would not have caused the algorithm
249     // to select this item. We thus do the same update to
250     // `suppress_begin_index`, but really, this element will not be added back
251     // into the priority queue in the following.
252     next_candidate.suppress_begin_index = selected.size();
253 
254     if (!should_hard_suppress) {
255       if (next_candidate.score == original_score) {
256         // Suppression has not occurred, so select next_candidate
257         selected.push_back(next_candidate.box_index);
258         selected_scores.push_back(next_candidate.score);
259         continue;
260       }
261       if (next_candidate.score > score_threshold) {
262         // Soft suppression has occurred and current score is still greater than
263         // score_threshold; add next_candidate back onto priority queue.
264         candidate_priority_queue.push(next_candidate);
265       }
266     }
267   }
268 
269   int num_valid_outputs = selected.size();
270   if (pad_to_max_output_size) {
271     selected.resize(output_size, 0);
272     selected_scores.resize(output_size, static_cast<T>(0));
273   }
274   if (ptr_num_valid_outputs) {
275     *ptr_num_valid_outputs = num_valid_outputs;
276   }
277 
278   // Allocate output tensors
279   Tensor* output_indices = nullptr;
280   TensorShape output_shape({static_cast<int>(selected.size())});
281   OP_REQUIRES_OK(context,
282                  context->allocate_output(0, output_shape, &output_indices));
283   TTypes<int, 1>::Tensor output_indices_data = output_indices->tensor<int, 1>();
284   std::copy_n(selected.begin(), selected.size(), output_indices_data.data());
285 
286   if (return_scores_tensor) {
287     Tensor* output_scores = nullptr;
288     OP_REQUIRES_OK(context,
289                    context->allocate_output(1, output_shape, &output_scores));
290     typename TTypes<T, 1>::Tensor output_scores_data =
291         output_scores->tensor<T, 1>();
292     std::copy_n(selected_scores.begin(), selected_scores.size(),
293                 output_scores_data.data());
294   }
295 }
296 
297 struct ResultCandidate {
298   int box_index;
299   float score;
300   int class_idx;
301   float box_coord[4];
302 };
303 
DoNMSPerClass(int batch_idx,int class_idx,const float * boxes_data,const float * scores_data,int num_boxes,int q,int num_classes,const int size_per_class,const float score_threshold,const float iou_threshold,std::vector<ResultCandidate> & result_candidate_vec)304 void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data,
305                    const float* scores_data, int num_boxes, int q,
306                    int num_classes, const int size_per_class,
307                    const float score_threshold, const float iou_threshold,
308                    std::vector<ResultCandidate>& result_candidate_vec) {
309   std::vector<float> class_scores_data;
310   class_scores_data.reserve(num_boxes);
311   std::vector<float> class_boxes_data;
312   class_boxes_data.reserve(num_boxes * 4);
313 
314   for (int box_idx = 0; box_idx < num_boxes; ++box_idx) {
315     class_scores_data.push_back(scores_data[box_idx * num_classes + class_idx]);
316     for (int cid = 0; cid < 4; ++cid) {
317       if (q > 1) {
318         class_boxes_data.push_back(
319             boxes_data[(box_idx * q + class_idx) * 4 + cid]);
320       } else {
321         class_boxes_data.push_back(boxes_data[box_idx * 4 + cid]);
322       }
323     }
324   }
325 
326   // Do NMS, get the candidate indices of form vector<int>
327   // Data structure for selection candidate in NMS.
328   struct Candidate {
329     int box_index;
330     float score;
331   };
332   auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
333     return bs_i.score < bs_j.score;
334   };
335   std::priority_queue<Candidate, std::vector<Candidate>, decltype(cmp)>
336       candidate_priority_queue(cmp);
337   for (int i = 0; i < num_boxes; ++i) {
338     if (class_scores_data[i] > score_threshold) {
339       candidate_priority_queue.emplace(Candidate({i, class_scores_data[i]}));
340     }
341   }
342 
343   std::vector<int> selected;
344   std::vector<float> selected_boxes;
345   Candidate next_candidate;
346 
347   // Move class_boxes_data to a tensor
348   Eigen::array<Eigen::DenseIndex, 2> boxesShape = {num_boxes, 4};
349   typename TTypes<float, 2>::ConstTensor boxes_data_t(class_boxes_data.data(),
350                                                       boxesShape);
351   float iou;
352   while (selected.size() < size_per_class &&
353          !candidate_priority_queue.empty()) {
354     next_candidate = candidate_priority_queue.top();
355     candidate_priority_queue.pop();
356     // Overlapping boxes are likely to have similar scores,
357     // therefore we iterate through the previously selected boxes backwards
358     // in order to see if `next_candidate` should be suppressed.
359     bool should_select = true;
360     for (int j = selected.size() - 1; j >= 0; --j) {
361       iou = IOU<float>(boxes_data_t, next_candidate.box_index, selected[j]);
362       if (iou > iou_threshold) {
363         should_select = false;
364         break;
365       }
366     }
367 
368     if (should_select) {
369       // Add the selected box to the result candidate. Sorted by score
370       int id = next_candidate.box_index;
371       result_candidate_vec[selected.size() + size_per_class * class_idx] = {
372           next_candidate.box_index,
373           next_candidate.score,
374           class_idx,
375           {boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2),
376            boxes_data_t(id, 3)}};
377       selected.push_back(next_candidate.box_index);
378     }
379   }
380 }
381 
SelectResultPerBatch(std::vector<float> & nmsed_boxes,std::vector<float> & nmsed_scores,std::vector<float> & nmsed_classes,std::vector<ResultCandidate> & result_candidate_vec,std::vector<int> & final_valid_detections,const int batch_idx,int total_size_per_batch,bool pad_per_class,int max_size_per_batch,bool clip_boxes,int per_batch_size)382 void SelectResultPerBatch(std::vector<float>& nmsed_boxes,
383                           std::vector<float>& nmsed_scores,
384                           std::vector<float>& nmsed_classes,
385                           std::vector<ResultCandidate>& result_candidate_vec,
386                           std::vector<int>& final_valid_detections,
387                           const int batch_idx, int total_size_per_batch,
388                           bool pad_per_class, int max_size_per_batch,
389                           bool clip_boxes, int per_batch_size) {
390   auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) {
391     return rc_i.score > rc_j.score;
392   };
393   std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp);
394 
395   int max_detections = 0;
396   int result_candidate_size =
397       std::count_if(result_candidate_vec.begin(), result_candidate_vec.end(),
398                     [](ResultCandidate rc) { return rc.box_index > -1; });
399   // If pad_per_class is false, we always pad to max_total_size
400   if (!pad_per_class) {
401     max_detections = std::min(result_candidate_size, total_size_per_batch);
402   } else {
403     max_detections = std::min(per_batch_size, result_candidate_size);
404   }
405 
406   final_valid_detections[batch_idx] = max_detections;
407 
408   int curr_total_size = max_detections;
409   int result_idx = 0;
410   // Pick the top max_detections values
411   while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) {
412     ResultCandidate next_candidate = result_candidate_vec[result_idx++];
413     // Add to final output vectors
414     if (clip_boxes) {
415       const float box_min = 0.0;
416       const float box_max = 1.0;
417       nmsed_boxes.push_back(
418           std::max(std::min(next_candidate.box_coord[0], box_max), box_min));
419       nmsed_boxes.push_back(
420           std::max(std::min(next_candidate.box_coord[1], box_max), box_min));
421       nmsed_boxes.push_back(
422           std::max(std::min(next_candidate.box_coord[2], box_max), box_min));
423       nmsed_boxes.push_back(
424           std::max(std::min(next_candidate.box_coord[3], box_max), box_min));
425     } else {
426       nmsed_boxes.push_back(next_candidate.box_coord[0]);
427       nmsed_boxes.push_back(next_candidate.box_coord[1]);
428       nmsed_boxes.push_back(next_candidate.box_coord[2]);
429       nmsed_boxes.push_back(next_candidate.box_coord[3]);
430     }
431     nmsed_scores.push_back(next_candidate.score);
432     nmsed_classes.push_back(next_candidate.class_idx);
433     curr_total_size--;
434   }
435 
436   nmsed_boxes.resize(per_batch_size * 4, 0);
437   nmsed_scores.resize(per_batch_size, 0);
438   nmsed_classes.resize(per_batch_size, 0);
439 }
440 
BatchedNonMaxSuppressionOp(OpKernelContext * context,const Tensor & inp_boxes,const Tensor & inp_scores,int num_boxes,const int max_size_per_class,const int total_size_per_batch,const float score_threshold,const float iou_threshold,bool pad_per_class=false,bool clip_boxes=true)441 void BatchedNonMaxSuppressionOp(
442     OpKernelContext* context, const Tensor& inp_boxes, const Tensor& inp_scores,
443     int num_boxes, const int max_size_per_class, const int total_size_per_batch,
444     const float score_threshold, const float iou_threshold,
445     bool pad_per_class = false, bool clip_boxes = true) {
446   const int num_batches = inp_boxes.dim_size(0);
447   int num_classes = inp_scores.dim_size(2);
448   int q = inp_boxes.dim_size(2);
449 
450   const float* scores_data =
451       const_cast<float*>(inp_scores.flat<float>().data());
452   const float* boxes_data = const_cast<float*>(inp_boxes.flat<float>().data());
453 
454   int boxes_per_batch = num_boxes * q * 4;
455   int scores_per_batch = num_boxes * num_classes;
456   const int size_per_class = std::min(max_size_per_class, num_boxes);
457   std::vector<std::vector<ResultCandidate>> result_candidate_vec(
458       num_batches,
459       std::vector<ResultCandidate>(size_per_class * num_classes,
460                                    {-1, -1.0, -1, {0.0, 0.0, 0.0, 0.0}}));
461 
462   // [num_batches, per_batch_size * 4]
463   std::vector<std::vector<float>> nmsed_boxes(num_batches);
464   // [num_batches, per_batch_size]
465   std::vector<std::vector<float>> nmsed_scores(num_batches);
466   // [num_batches, per_batch_size]
467   std::vector<std::vector<float>> nmsed_classes(num_batches);
468   // [num_batches]
469   std::vector<int> final_valid_detections(num_batches);
470 
471   auto shard_nms = [&](int begin, int end) {
472     for (int idx = begin; idx < end; ++idx) {
473       int batch_idx = idx / num_classes;
474       int class_idx = idx % num_classes;
475       DoNMSPerClass(batch_idx, class_idx,
476                     boxes_data + boxes_per_batch * batch_idx,
477                     scores_data + scores_per_batch * batch_idx, num_boxes, q,
478                     num_classes, size_per_class, score_threshold, iou_threshold,
479                     result_candidate_vec[batch_idx]);
480     }
481   };
482 
483   int length = num_batches * num_classes;
484   // Input data boxes_data, scores_data
485   int input_bytes = num_boxes * 10 * sizeof(float);
486   int output_bytes = num_boxes * 10 * sizeof(float);
487   int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
488                        Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
489                        Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
490                        Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
491   // The cost here is not the actual number of cycles, but rather a set of
492   // hand-tuned numbers that seem to work best.
493   const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
494   const CPUDevice& d = context->eigen_device<CPUDevice>();
495   d.parallelFor(length, cost, shard_nms);
496 
497   int per_batch_size = total_size_per_batch;
498   if (pad_per_class) {
499     per_batch_size =
500         std::min(total_size_per_batch, max_size_per_class * num_classes);
501   }
502 
503   Tensor* valid_detections_t = nullptr;
504   TensorShape valid_detections_shape({num_batches});
505   OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
506                                                    &valid_detections_t));
507   auto valid_detections_flat = valid_detections_t->template flat<int>();
508 
509   auto shard_result = [&](int begin, int end) {
510     for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
511       SelectResultPerBatch(
512           nmsed_boxes[batch_idx], nmsed_scores[batch_idx],
513           nmsed_classes[batch_idx], result_candidate_vec[batch_idx],
514           final_valid_detections, batch_idx, total_size_per_batch,
515           pad_per_class, max_size_per_class * num_classes, clip_boxes,
516           per_batch_size);
517       valid_detections_flat(batch_idx) = final_valid_detections[batch_idx];
518     }
519   };
520   length = num_batches;
521   // Input data boxes_data, scores_data
522   input_bytes =
523       num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
524   output_bytes =
525       num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
526   compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
527                    Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
528   // The cost here is not the actual number of cycles, but rather a set of
529   // hand-tuned numbers that seem to work best.
530   const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
531                                         compute_cycles);
532   d.parallelFor(length, cost_result, shard_result);
533 
534   Tensor* nmsed_boxes_t = nullptr;
535   TensorShape boxes_shape({num_batches, per_batch_size, 4});
536   OP_REQUIRES_OK(context,
537                  context->allocate_output(0, boxes_shape, &nmsed_boxes_t));
538   auto nmsed_boxes_flat = nmsed_boxes_t->template flat<float>();
539 
540   Tensor* nmsed_scores_t = nullptr;
541   TensorShape scores_shape({num_batches, per_batch_size});
542   OP_REQUIRES_OK(context,
543                  context->allocate_output(1, scores_shape, &nmsed_scores_t));
544   auto nmsed_scores_flat = nmsed_scores_t->template flat<float>();
545 
546   Tensor* nmsed_classes_t = nullptr;
547   OP_REQUIRES_OK(context,
548                  context->allocate_output(2, scores_shape, &nmsed_classes_t));
549   auto nmsed_classes_flat = nmsed_classes_t->template flat<float>();
550 
551   auto shard_copy_result = [&](int begin, int end) {
552     for (int idx = begin; idx < end; ++idx) {
553       int batch_idx = idx / per_batch_size;
554       int j = idx % per_batch_size;
555       nmsed_scores_flat(idx) = nmsed_scores[batch_idx][j];
556       nmsed_classes_flat(idx) = nmsed_classes[batch_idx][j];
557       for (int k = 0; k < 4; ++k) {
558         nmsed_boxes_flat(idx * 4 + k) = nmsed_boxes[batch_idx][j * 4 + k];
559       }
560     }
561   };
562   length = num_batches * per_batch_size;
563   // Input data boxes_data, scores_data
564   input_bytes = 6 * sizeof(float);
565   output_bytes = 6 * sizeof(float);
566   compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
567                    Eigen::TensorOpCost::MulCost<int>() * 2 +
568                    Eigen::TensorOpCost::DivCost<float>() * 2;
569   const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
570                                              compute_cycles);
571   d.parallelFor(length, cost_copy_result, shard_copy_result);
572 }
573 
574 }  // namespace
575 
576 template <typename Device>
577 class NonMaxSuppressionOp : public OpKernel {
578  public:
NonMaxSuppressionOp(OpKernelConstruction * context)579   explicit NonMaxSuppressionOp(OpKernelConstruction* context)
580       : OpKernel(context) {
581     OP_REQUIRES_OK(context, context->GetAttr("iou_threshold", &iou_threshold_));
582   }
583 
Compute(OpKernelContext * context)584   void Compute(OpKernelContext* context) override {
585     // boxes: [num_boxes, 4]
586     const Tensor& boxes = context->input(0);
587     // scores: [num_boxes]
588     const Tensor& scores = context->input(1);
589     // max_output_size: scalar
590     const Tensor& max_output_size = context->input(2);
591     OP_REQUIRES(
592         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
593         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
594                                 max_output_size.shape().DebugString()));
595 
596     OP_REQUIRES(context, iou_threshold_ >= 0 && iou_threshold_ <= 1,
597                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
598     int num_boxes = 0;
599     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
600     CheckScoreSizes(context, num_boxes, scores);
601     if (!context->status().ok()) {
602       return;
603     }
604     auto similarity_fn = CreateIOUSimilarityFn<float>(boxes);
605 
606     const float score_threshold_val = std::numeric_limits<float>::lowest();
607     const float dummy_soft_nms_sigma = static_cast<float>(0.0);
608     DoNonMaxSuppressionOp<float>(context, scores, num_boxes, max_output_size,
609                                  iou_threshold_, score_threshold_val,
610                                  dummy_soft_nms_sigma, similarity_fn);
611   }
612 
613  private:
614   float iou_threshold_;
615 };
616 
617 template <typename Device, typename T>
618 class NonMaxSuppressionV2Op : public OpKernel {
619  public:
NonMaxSuppressionV2Op(OpKernelConstruction * context)620   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
621       : OpKernel(context) {}
622 
Compute(OpKernelContext * context)623   void Compute(OpKernelContext* context) override {
624     // boxes: [num_boxes, 4]
625     const Tensor& boxes = context->input(0);
626     // scores: [num_boxes]
627     const Tensor& scores = context->input(1);
628     // max_output_size: scalar
629     const Tensor& max_output_size = context->input(2);
630     OP_REQUIRES(
631         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
632         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
633                                 max_output_size.shape().DebugString()));
634     // iou_threshold: scalar
635     const Tensor& iou_threshold = context->input(3);
636     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
637                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
638                                         iou_threshold.shape().DebugString()));
639     const T iou_threshold_val = iou_threshold.scalar<T>()();
640 
641     OP_REQUIRES(context,
642                 iou_threshold_val >= static_cast<T>(0.0) &&
643                     iou_threshold_val <= static_cast<T>(1.0),
644                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
645     int num_boxes = 0;
646     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
647     CheckScoreSizes(context, num_boxes, scores);
648     if (!context->status().ok()) {
649       return;
650     }
651     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
652 
653     const T score_threshold_val = std::numeric_limits<T>::lowest();
654     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
655     DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
656                              iou_threshold_val, score_threshold_val,
657                              dummy_soft_nms_sigma, similarity_fn);
658   }
659 };
660 
661 template <typename Device, typename T>
662 class NonMaxSuppressionV3Op : public OpKernel {
663  public:
NonMaxSuppressionV3Op(OpKernelConstruction * context)664   explicit NonMaxSuppressionV3Op(OpKernelConstruction* context)
665       : OpKernel(context) {}
666 
Compute(OpKernelContext * context)667   void Compute(OpKernelContext* context) override {
668     // boxes: [num_boxes, 4]
669     const Tensor& boxes = context->input(0);
670     // scores: [num_boxes]
671     const Tensor& scores = context->input(1);
672     // max_output_size: scalar
673     const Tensor& max_output_size = context->input(2);
674     OP_REQUIRES(
675         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
676         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
677                                 max_output_size.shape().DebugString(),
678                                 " (Shape must be rank 0 but is ", "rank ",
679                                 max_output_size.dims(), ")"));
680     // iou_threshold: scalar
681     const Tensor& iou_threshold = context->input(3);
682     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
683                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
684                                         iou_threshold.shape().DebugString(),
685                                         " (Shape must be rank 0 but is rank ",
686                                         iou_threshold.dims(), ")"));
687     const T iou_threshold_val = iou_threshold.scalar<T>()();
688     OP_REQUIRES(context,
689                 iou_threshold_val >= static_cast<T>(0.0) &&
690                     iou_threshold_val <= static_cast<T>(1.0),
691                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
692     // score_threshold: scalar
693     const Tensor& score_threshold = context->input(4);
694     OP_REQUIRES(
695         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
696         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
697                                 score_threshold.shape().DebugString()));
698     const T score_threshold_val = score_threshold.scalar<T>()();
699 
700     int num_boxes = 0;
701     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
702     CheckScoreSizes(context, num_boxes, scores);
703     if (!context->status().ok()) {
704       return;
705     }
706 
707     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
708 
709     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
710     DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
711                              iou_threshold_val, score_threshold_val,
712                              dummy_soft_nms_sigma, similarity_fn);
713   }
714 };
715 
716 template <typename Device, typename T>
717 class NonMaxSuppressionV4Op : public OpKernel {
718  public:
NonMaxSuppressionV4Op(OpKernelConstruction * context)719   explicit NonMaxSuppressionV4Op(OpKernelConstruction* context)
720       : OpKernel(context) {
721     OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
722                                              &pad_to_max_output_size_));
723   }
724 
Compute(OpKernelContext * context)725   void Compute(OpKernelContext* context) override {
726     // boxes: [num_boxes, 4]
727     const Tensor& boxes = context->input(0);
728     // scores: [num_boxes]
729     const Tensor& scores = context->input(1);
730     // max_output_size: scalar
731     const Tensor& max_output_size = context->input(2);
732     OP_REQUIRES(
733         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
734         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
735                                 max_output_size.shape().DebugString()));
736     // iou_threshold: scalar
737     const Tensor& iou_threshold = context->input(3);
738     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
739                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
740                                         iou_threshold.shape().DebugString()));
741     const T iou_threshold_val = iou_threshold.scalar<T>()();
742     OP_REQUIRES(context,
743                 iou_threshold_val >= static_cast<T>(0.0) &&
744                     iou_threshold_val <= static_cast<T>(1.0),
745                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
746     // score_threshold: scalar
747     const Tensor& score_threshold = context->input(4);
748     OP_REQUIRES(
749         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
750         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
751                                 score_threshold.shape().DebugString()));
752     const T score_threshold_val = score_threshold.scalar<T>()();
753 
754     int num_boxes = 0;
755     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
756     CheckScoreSizes(context, num_boxes, scores);
757     if (!context->status().ok()) {
758       return;
759     }
760 
761     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
762     int num_valid_outputs;
763 
764     bool return_scores_tensor_ = false;
765     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
766     DoNonMaxSuppressionOp<T>(
767         context, scores, num_boxes, max_output_size, iou_threshold_val,
768         score_threshold_val, dummy_soft_nms_sigma, similarity_fn,
769         return_scores_tensor_, pad_to_max_output_size_, &num_valid_outputs);
770 
771     // Allocate scalar output tensor for number of indices computed.
772     Tensor* num_outputs_t = nullptr;
773     OP_REQUIRES_OK(context, context->allocate_output(
774                                 1, tensorflow::TensorShape{}, &num_outputs_t));
775     num_outputs_t->scalar<int32>().setConstant(num_valid_outputs);
776   }
777 
778  private:
779   bool pad_to_max_output_size_;
780 };
781 
782 template <typename Device, typename T>
783 class NonMaxSuppressionV5Op : public OpKernel {
784  public:
NonMaxSuppressionV5Op(OpKernelConstruction * context)785   explicit NonMaxSuppressionV5Op(OpKernelConstruction* context)
786       : OpKernel(context) {
787     OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
788                                              &pad_to_max_output_size_));
789   }
790 
Compute(OpKernelContext * context)791   void Compute(OpKernelContext* context) override {
792     // boxes: [num_boxes, 4]
793     const Tensor& boxes = context->input(0);
794     // scores: [num_boxes]
795     const Tensor& scores = context->input(1);
796     // max_output_size: scalar
797     const Tensor& max_output_size = context->input(2);
798     OP_REQUIRES(
799         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
800         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
801                                 max_output_size.shape().DebugString()));
802     // iou_threshold: scalar
803     const Tensor& iou_threshold = context->input(3);
804     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
805                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
806                                         iou_threshold.shape().DebugString()));
807     const T iou_threshold_val = iou_threshold.scalar<T>()();
808     OP_REQUIRES(context,
809                 iou_threshold_val >= static_cast<T>(0.0) &&
810                     iou_threshold_val <= static_cast<T>(1.0),
811                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
812     // score_threshold: scalar
813     const Tensor& score_threshold = context->input(4);
814     OP_REQUIRES(
815         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
816         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
817                                 score_threshold.shape().DebugString()));
818     const T score_threshold_val = score_threshold.scalar<T>()();
819 
820     // soft_nms_sigma: scalar
821     const Tensor& soft_nms_sigma = context->input(5);
822     OP_REQUIRES(
823         context, TensorShapeUtils::IsScalar(soft_nms_sigma.shape()),
824         errors::InvalidArgument("soft_nms_sigma must be 0-D, got shape ",
825                                 soft_nms_sigma.shape().DebugString()));
826     const T soft_nms_sigma_val = soft_nms_sigma.scalar<T>()();
827     OP_REQUIRES(context, soft_nms_sigma_val >= static_cast<T>(0.0),
828                 errors::InvalidArgument("soft_nms_sigma_val must be >= 0"));
829 
830     int num_boxes = 0;
831     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
832     CheckScoreSizes(context, num_boxes, scores);
833     if (!context->status().ok()) {
834       return;
835     }
836 
837     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
838     int num_valid_outputs;
839 
840     // For NonMaxSuppressionV5Op, we always return a second output holding
841     // corresponding scores, so `return_scores_tensor` should never be false.
842     const bool return_scores_tensor_ = true;
843     DoNonMaxSuppressionOp<T>(
844         context, scores, num_boxes, max_output_size, iou_threshold_val,
845         score_threshold_val, soft_nms_sigma_val, similarity_fn,
846         return_scores_tensor_, pad_to_max_output_size_, &num_valid_outputs);
847 
848     // Allocate scalar output tensor for number of indices computed.
849     Tensor* num_outputs_t = nullptr;
850     OP_REQUIRES_OK(context, context->allocate_output(
851                                 2, tensorflow::TensorShape{}, &num_outputs_t));
852     num_outputs_t->scalar<int32>().setConstant(num_valid_outputs);
853   }
854 
855  private:
856   bool pad_to_max_output_size_;
857 };
858 
859 template <typename Device>
860 class NonMaxSuppressionWithOverlapsOp : public OpKernel {
861  public:
NonMaxSuppressionWithOverlapsOp(OpKernelConstruction * context)862   explicit NonMaxSuppressionWithOverlapsOp(OpKernelConstruction* context)
863       : OpKernel(context) {}
864 
Compute(OpKernelContext * context)865   void Compute(OpKernelContext* context) override {
866     // overlaps: [num_boxes, num_boxes]
867     const Tensor& overlaps = context->input(0);
868     // scores: [num_boxes]
869     const Tensor& scores = context->input(1);
870     // max_output_size: scalar
871     const Tensor& max_output_size = context->input(2);
872     OP_REQUIRES(
873         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
874         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
875                                 max_output_size.shape().DebugString()));
876     // overlap_threshold: scalar
877     const Tensor& overlap_threshold = context->input(3);
878     OP_REQUIRES(
879         context, TensorShapeUtils::IsScalar(overlap_threshold.shape()),
880         errors::InvalidArgument("overlap_threshold must be 0-D, got shape ",
881                                 overlap_threshold.shape().DebugString()));
882     const float overlap_threshold_val = overlap_threshold.scalar<float>()();
883 
884     // score_threshold: scalar
885     const Tensor& score_threshold = context->input(4);
886     OP_REQUIRES(
887         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
888         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
889                                 score_threshold.shape().DebugString()));
890     const float score_threshold_val = score_threshold.scalar<float>()();
891 
892     int num_boxes = 0;
893     ParseAndCheckOverlapSizes(context, overlaps, &num_boxes);
894     CheckScoreSizes(context, num_boxes, scores);
895     if (!context->status().ok()) {
896       return;
897     }
898     auto similarity_fn = CreateOverlapSimilarityFn<float>(overlaps);
899 
900     const float dummy_soft_nms_sigma = static_cast<float>(0.0);
901     DoNonMaxSuppressionOp<float>(context, scores, num_boxes, max_output_size,
902                                  overlap_threshold_val, score_threshold_val,
903                                  dummy_soft_nms_sigma, similarity_fn);
904   }
905 };
906 
907 template <typename Device>
908 class CombinedNonMaxSuppressionOp : public OpKernel {
909  public:
CombinedNonMaxSuppressionOp(OpKernelConstruction * context)910   explicit CombinedNonMaxSuppressionOp(OpKernelConstruction* context)
911       : OpKernel(context) {
912     OP_REQUIRES_OK(context, context->GetAttr("pad_per_class", &pad_per_class_));
913     OP_REQUIRES_OK(context, context->GetAttr("clip_boxes", &clip_boxes_));
914   }
915 
Compute(OpKernelContext * context)916   void Compute(OpKernelContext* context) override {
917     // boxes: [batch_size, num_anchors, q, 4]
918     const Tensor& boxes = context->input(0);
919     // scores: [batch_size, num_anchors, num_classes]
920     const Tensor& scores = context->input(1);
921     OP_REQUIRES(
922         context, (boxes.dim_size(0) == scores.dim_size(0)),
923         errors::InvalidArgument("boxes and scores must have same batch size"));
924 
925     // max_output_size: scalar
926     const Tensor& max_output_size = context->input(2);
927     OP_REQUIRES(
928         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
929         errors::InvalidArgument("max_size_per_class must be 0-D, got shape ",
930                                 max_output_size.shape().DebugString()));
931     const int max_size_per_class = max_output_size.scalar<int>()();
932     // max_total_size: scalar
933     const Tensor& max_total_size = context->input(3);
934     OP_REQUIRES(
935         context, TensorShapeUtils::IsScalar(max_total_size.shape()),
936         errors::InvalidArgument("max_total_size must be 0-D, got shape ",
937                                 max_total_size.shape().DebugString()));
938     const int max_total_size_per_batch = max_total_size.scalar<int>()();
939     OP_REQUIRES(context, max_total_size_per_batch > 0,
940                 errors::InvalidArgument("max_total_size must be > 0"));
941     // Throw warning when `max_total_size` is too large as it may cause OOM.
942     if (max_total_size_per_batch > pow(10, 6)) {
943       LOG(WARNING) << "Detected a large value for `max_total_size`. This may "
944                    << "cause OOM error. (max_total_size: "
945                    << max_total_size.scalar<int>()() << ")";
946     }
947     // iou_threshold: scalar
948     const Tensor& iou_threshold = context->input(4);
949     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
950                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
951                                         iou_threshold.shape().DebugString()));
952     const float iou_threshold_val = iou_threshold.scalar<float>()();
953 
954     // score_threshold: scalar
955     const Tensor& score_threshold = context->input(5);
956     OP_REQUIRES(
957         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
958         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
959                                 score_threshold.shape().DebugString()));
960     const float score_threshold_val = score_threshold.scalar<float>()();
961 
962     OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
963                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
964     int num_boxes = 0;
965     const int num_classes = scores.dim_size(2);
966     ParseAndCheckCombinedNMSBoxSizes(context, boxes, &num_boxes, num_classes);
967     CheckCombinedNMSScoreSizes(context, num_boxes, scores);
968 
969     if (!context->status().ok()) {
970       return;
971     }
972     BatchedNonMaxSuppressionOp(context, boxes, scores, num_boxes,
973                                max_size_per_class, max_total_size_per_batch,
974                                score_threshold_val, iou_threshold_val,
975                                pad_per_class_, clip_boxes_);
976   }
977 
978  private:
979   bool pad_per_class_;
980   bool clip_boxes_;
981 };
982 
983 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
984                         NonMaxSuppressionOp<CPUDevice>);
985 
986 REGISTER_KERNEL_BUILDER(
987     Name("NonMaxSuppressionV2").TypeConstraint<float>("T").Device(DEVICE_CPU),
988     NonMaxSuppressionV2Op<CPUDevice, float>);
989 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2")
990                             .TypeConstraint<Eigen::half>("T")
991                             .Device(DEVICE_CPU),
992                         NonMaxSuppressionV2Op<CPUDevice, Eigen::half>);
993 
994 REGISTER_KERNEL_BUILDER(
995     Name("NonMaxSuppressionV3").TypeConstraint<float>("T").Device(DEVICE_CPU),
996     NonMaxSuppressionV3Op<CPUDevice, float>);
997 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV3")
998                             .TypeConstraint<Eigen::half>("T")
999                             .Device(DEVICE_CPU),
1000                         NonMaxSuppressionV3Op<CPUDevice, Eigen::half>);
1001 
1002 REGISTER_KERNEL_BUILDER(
1003     Name("NonMaxSuppressionV4").TypeConstraint<float>("T").Device(DEVICE_CPU),
1004     NonMaxSuppressionV4Op<CPUDevice, float>);
1005 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV4")
1006                             .TypeConstraint<Eigen::half>("T")
1007                             .Device(DEVICE_CPU),
1008                         NonMaxSuppressionV4Op<CPUDevice, Eigen::half>);
1009 
1010 REGISTER_KERNEL_BUILDER(
1011     Name("NonMaxSuppressionV5").TypeConstraint<float>("T").Device(DEVICE_CPU),
1012     NonMaxSuppressionV5Op<CPUDevice, float>);
1013 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV5")
1014                             .TypeConstraint<Eigen::half>("T")
1015                             .Device(DEVICE_CPU),
1016                         NonMaxSuppressionV5Op<CPUDevice, Eigen::half>);
1017 
1018 REGISTER_KERNEL_BUILDER(
1019     Name("NonMaxSuppressionWithOverlaps").Device(DEVICE_CPU),
1020     NonMaxSuppressionWithOverlapsOp<CPUDevice>);
1021 
1022 REGISTER_KERNEL_BUILDER(Name("CombinedNonMaxSuppression").Device(DEVICE_CPU),
1023                         CombinedNonMaxSuppressionOp<CPUDevice>);
1024 
1025 }  // namespace tensorflow
1026