1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Neural Net operation support for StreamExecutor instances.
17 //
18 // This is an abstract interface for a platform to optionally support common
19 // neural net operations; it accommodates implementations such as the cudnn
20 // library operations.
21 
22 #ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
23 #define TENSORFLOW_STREAM_EXECUTOR_DNN_H_
24 
25 #include <functional>
26 #include <limits>
27 #include <memory>
28 #include <tuple>
29 
30 #include "absl/types/optional.h"
31 #include "absl/types/span.h"
32 #include "tensorflow/core/platform/protobuf.h"
33 #include "tensorflow/stream_executor/device_memory.h"
34 #include "tensorflow/stream_executor/dnn.pb.h"
35 #include "tensorflow/stream_executor/lib/array_slice.h"
36 #include "tensorflow/stream_executor/lib/status.h"
37 #include "tensorflow/stream_executor/lib/statusor.h"
38 #include "tensorflow/stream_executor/platform/logging.h"
39 #include "tensorflow/stream_executor/platform/port.h"
40 
41 namespace Eigen {
42 struct half;
43 }  // namespace Eigen
44 
45 namespace stream_executor {
46 
47 class HostBuffer;
48 class Stream;
49 class ScratchAllocator;
50 
51 namespace dnn {
52 
53 // Specifies an index to use when accessing specific spatial dimensions.
// Each enumerator's value is the offset, counted from the END of a dimension
// array, at which that spatial dimension is stored (GetDim/SetDim index via
// rbegin()). X is therefore the last element, Y the one before it, and Z the
// one before that.
enum class DimIndex : int {
  X = 0,  // Last element of the dimension array.
  Y = 1,  // Second-to-last element.
  Z = 2,  // Third-to-last element.
};
59 
60 // Helper functions to make methods more readable.
GetDim(absl::Span<const int64> data,DimIndex dim)61 inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
62   return data.rbegin()[static_cast<int64>(dim)];
63 }
64 
SetDim(absl::Span<int64> data,DimIndex dim,int64 value)65 inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
66   data.rbegin()[static_cast<int64>(dim)] = value;
67 }
68 
SetDim(std::vector<int64> * data,DimIndex dim,int64 value)69 inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
70   return SetDim(absl::MakeSpan(*data), dim, value);
71 }
72 
73 // tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
74 // open-source. Wrapper function that gives an int64 array slice view of a
75 // repeated int64 protobuf field.
AsInt64Slice(const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64> & v)76 inline absl::Span<const int64> AsInt64Slice(
77     const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
78   return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
79                                  v.size());
80 }
81 
AsInt64Slice(tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64> * v)82 inline absl::Span<int64> AsInt64Slice(
83     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
84   return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
85                            v->size());
86 }
87 
// Returns a string representation of the given data layout.
// Declaration only; the implementation is provided elsewhere.
string DataLayoutString(DataLayout layout);
90 
91 // Specifies a quantization for activations in a given BatchDescriptor.
// NOTE(review): the explicit values 1/2/4 look like the width of each mode in
// bytes -- confirm before relying on that interpretation.
enum class QuantizedActivationMode {
  k8Bit = 1,   // 8-bit quantized activations.
  k16Bit = 2,  // 16-bit quantized activations.
  k32Bit = 4,  // 32-bit quantized activations.
};
97 
98 // A helper class to convert C/C++ types to the proper enums.
// Primary template: only declared here, never defined. ToDataType<T>::value
// therefore compiles only for element types that have a specialization (the
// ones below, plus any provided elsewhere).
template <typename T>
struct ToDataType;
// float -> DataType::kFloat.
template <>
struct ToDataType<float> {
  static constexpr DataType value = DataType::kFloat;
};
// double -> DataType::kDouble.
template <>
struct ToDataType<double> {
  static constexpr DataType value = DataType::kDouble;
};
// Eigen::half -> DataType::kHalf. Eigen::half is only forward-declared at the
// top of this header.
template <>
struct ToDataType<Eigen::half> {
  static constexpr DataType value = DataType::kHalf;
};
// int8 -> DataType::kInt8.
template <>
struct ToDataType<int8> {
  static constexpr DataType value = DataType::kInt8;
};
// int32 -> DataType::kInt32.
template <>
struct ToDataType<int32> {
  static constexpr DataType value = DataType::kInt32;
};
121 
122 // Specifies the types of a RNN model.
// Enumerator values are fixed explicitly.
// NOTE(review): the per-enumerator descriptions below are inferred from the
// names -- confirm against the platform implementation.
enum class RnnMode {
  kRnnRelu = 0,  // Presumably a plain RNN cell with ReLU activation.
  kRnnTanh = 1,  // Presumably a plain RNN cell with tanh activation.
  kRnnLstm = 2,  // Presumably an LSTM cell.
  kRnnGru = 3,   // Presumably a GRU cell.
};
129 
130 // Specifies the input model and whether there is a linear transformation
131 // between the input state and the first layer hidden state.
// Enumerator values are fixed explicitly.
// NOTE(review): the per-enumerator descriptions below are inferred from the
// names -- confirm against the platform implementation.
enum class RnnInputMode {
  kRnnLinearSkip = 0,  // Presumably: a linear transformation is applied.
  kRnnSkipInput = 1,   // Presumably: the input transformation is skipped.
};
136 
// Specifies the number of directions used in a RNN model. When the
// bidirectional mode is used, the input states and output sequence contain
// data for both directions.
// Enumerator values are fixed explicitly.
enum class RnnDirectionMode {
  kRnnUnidirectional = 0,  // Single direction.
  kRnnBidirectional = 1,   // Both directions.
};
144 
145 // Relevant to DepthToSpace and SpaceToDepth. This is the write layout when
146 // performing depth to space and the read layout when performing space to depth.
147 // It's specified with most-major dimension first and most-minor dimension last.
148 // In DepthToSpace, the D*M² values are read in and then, for DepthHeightWidth,
149 // written out to the output patch, by varying first width, then height, then
150 // depth. In C array format, it looks like [depth][height][width]. See
151 // DepthToSpace comment for more information.
// DepthHeightWidth is the only layout defined here; it has the implicit
// value 0.
enum class DepthToSpaceLayout { DepthHeightWidth };
153 
154 // Specifies the descriptor for a RNN model.
155 //
156 // An example use case:
157 //   * The user first creates a model through createRnnDescriptor.
158 //   * The user queries the size of the underlying opaque parameter buffer.
159 //   * The user creates and initializes a parameter buffer of the proper size.
160 //   * The user runs forward and backward operations using this RNN descriptor.
//   * Once in a while, the user queries maintainable weight and bias regions
//       from the underlying parameter buffer. They are more likely to be
//       forward compatible and should be used in saving and restoring a model.
164 //   * The user releases the RNN descriptor when the model is no longer in use.
165 class RnnDescriptor {
166  public:
167   struct ParamsRegion {
168     int64 offset;
169     int64 size;
170   };
171   typedef std::vector<ParamsRegion> ParamsRegions;
172   virtual ~RnnDescriptor() {}
173   virtual int64 ParamsSizeInBytes() const { return -1; }
174   virtual ParamsRegions ParamsWeightRegions() const { return ParamsRegions(); }
175   virtual ParamsRegions ParamsBiasRegions() const { return ParamsRegions(); }
176 };
177 
178 // Specifies the sequence in a RNN model.
179 //
180 // The user is responsible for releasing this descriptor when it is no longer
181 // in use. The destructor releases the underlying descriptors.
// Empty polymorphic base: it declares nothing but a virtual destructor, so
// objects of derived types can be destroyed through a pointer to this type.
class RnnSequenceTensorDescriptor {
 public:
  virtual ~RnnSequenceTensorDescriptor() = default;
};
186 
// Specifies either the input or the hidden state in a RNN model.
188 //
189 // The user is responsible for releasing this descriptor when it is no longer
190 // in use. The destructor releases the underlying descriptors.
// Empty polymorphic base: it declares nothing but a virtual destructor, so
// objects of derived types can be destroyed through a pointer to this type.
class RnnStateTensorDescriptor {
 public:
  virtual ~RnnStateTensorDescriptor() = default;
};
195 
// Returns a string representation of the given quantization mode.
// Declaration only; the implementation is provided elsewhere.
string QuantizedActivationModeString(QuantizedActivationMode mode);
198 
199 // Describes the dimensions that a layer consumes/produces.
200 //
201 // This is a matrix (height, width), its "depth" (feature_map_count),
202 // how many of these matrices are present (count),
203 // and the maximum and minimum values expected in the matrix (value_max,
204 // value_min).
205 // If input is quantized, all values greater
206 // than value_max will be clipped to value_max and all values less than
207 // value_min will be clipped to value_min.
208 // When quantized output is dequantized no value will be greater than
209 // value_max or less than value_min.
210 //
211 // Uses the named argument construction form:
212 //
213 //  auto input_batch_dimensions =
214 //      BatchDescriptor().set_count(42).set_feature_map_count(7)...
215 //
216 // Details:
217 //
218 // For a convolutional layer, a single inference takes a 3-dimensional matrix
219 // of input and produces a 3-dimensional matrix of output. We call the three
220 // dimensions height, width and feature_map_count, where for an image, the
221 // height and width correspond to the Y and X pixel indices, respectively, and
222 // the feature_map_count corresponds to the RGB dimension of the input data.
223 // Then the count indicates how many 3D matrices are being presented to be
224 // processed at once; this corresponds to the neural network concept of
225 // minibatch size.
226 //
227 // For a fully connected layer, it's better to put the nodes of the layer in
// the feature_map_count, and leave the height and width as degenerate (== 1).
229 // Count indicates how many input vectors (degenerate 3D matrices) are to be
230 // processed.
231 //
232 // If unspecified, value_max and value_min default to 0.0.
233 // If value_max == value_min the Stream will attempt to derive valid values -
234 // for example the output of Relu6 activation will always be in the range
235 // [0.0, 6.0].
236 //
237 // If unspecified, layout defaults to kYXDepthBatch.
238 class BatchDescriptor {
239  public:
240   // Creates a "blank" batch descriptor, which should be initialized via the
241   // named argument helpers.
242   BatchDescriptor();
243   explicit BatchDescriptor(int ndims);
244 
245   // Clones values from 'other' for initialization.
246   void CloneFrom(const BatchDescriptor& other);
247 
248   string ToString() const;
249   string ToShortString() const;
250 
251   // Pre-condition:
252   //   value_max_ == 0
253   //   value_min_ == 0
254   //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
255   TensorDescriptorProto ToProto(DataType data_type) const;
256 
257   // Accessors.
258   int64 count() const { return tensor_.dimensions(0); }
259   int64 feature_map_count() const { return tensor_.dimensions(1); }
260   int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
261   int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
262   int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
263   int ndims() const { return spatial_size().size(); }
264   float value_max() const { return value_max_; }
265   float value_min() const { return value_min_; }
266   DataLayout layout() const { return tensor_.data_layout(); }
267   QuantizedActivationMode quantized_activation_mode() const {
268     return quantized_activation_mode_;
269   }
270   // Full dimensions of the underlying data, ordered according to a specific
271   // layout.
272   std::vector<int64> full_dims(const DataLayout& layout) const;
273 
274   // Full strides of the underlying data, ordered according to a specific
275   // layout.
276   std::vector<int64> full_strides(const DataLayout& layout) const;
277 
278   // Named-argument helpers for avoiding user error during construction.
279   BatchDescriptor& set_count(int64 value) {
280     tensor_.set_dimensions(0, value);
281     return *this;
282   }
283   BatchDescriptor& set_feature_map_count(int64 value) {
284     tensor_.set_dimensions(1, value);
285     return *this;
286   }
287   BatchDescriptor& set_height(int64 value) {
288     SetDim(spatial_size(), DimIndex::Y, value);
289     return *this;
290   }
291   BatchDescriptor& set_width(int64 value) {
292     SetDim(spatial_size(), DimIndex::X, value);
293     return *this;
294   }
295   BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
296     SetDim(spatial_size(), dim, value);
297     return *this;
298   }
299   BatchDescriptor& set_value_max(float value) {
300     value_max_ = value;
301     return *this;
302   }
303   BatchDescriptor& set_value_min(float value) {
304     value_min_ = value;
305     return *this;
306   }
307   BatchDescriptor& set_layout(DataLayout layout) {
308     tensor_.set_data_layout(layout);
309     return *this;
310   }
311   BatchDescriptor& set_quantized_activation_mode(
312       QuantizedActivationMode quantized_activation_mode) {
313     quantized_activation_mode_ = quantized_activation_mode;
314     return *this;
315   }
316 
317   // Return the number of nodes in a single feature map.
318   int64 NodesPerFeatureMap() const;
319 
320   // Return the number of nodes across all feature maps. Note that this is not
321   // affected by the batch count.
322   int64 NodesAcrossFeatureMaps() const;
323 
324   // Returns the number of elements (e.g. RGB pixel values) required to hold a
325   // given batch descriptor, given a no-padding assumption. Note that this is
326   // affected by the batch count.
327   int64 ElementCount() const;
328 
329   // Return the number of weights required to fully connect a layer with
330   // dimensions given by the 'input' descriptor with a layer with dimensions
331   // given by the 'output' descriptor.
332   static int64 FullyConnectedWeightCount(const BatchDescriptor& input,
333                                          const BatchDescriptor& output);
334 
335   // Return the number of biases required to fully connect to an output layer
336   // with dimensions given the 'output' descriptor.
337   static int64 FullyConnectedBiasCount(const BatchDescriptor& output);
338 
339   // Return a BatchDescriptor for the output of a depth concatenation
340   // with the given input descriptors. The inputs should have the same
341   // dimensions, except possibly for feature_map_count(), though this
342   // function does not verify that.
343   static BatchDescriptor DepthConcatenateOutputDescriptor(
344       port::ArraySlice<dnn::BatchDescriptor> inputs);
345 
346  private:
347   absl::Span<const int64> spatial_size() const {
348     return AsInt64Slice(tensor_.dimensions()).subspan(2);
349   }
350 
351   absl::Span<int64> spatial_size() {
352     return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
353   }
354 
355   TensorDescriptorProto tensor_;
356   float value_max_;
357   float value_min_;
358   QuantizedActivationMode quantized_activation_mode_;
359 };
360 
// Returns a string representation of the given filter layout.
// Declaration only; the implementation is provided elsewhere.
string FilterLayoutString(FilterLayout layout);
363 
364 // Describes a filter for the convolution. This is the "window" from
365 // height-by-width patches of each of the feature maps in the input layer to the
366 // cells within the output feature map.
367 //
368 // Uses the named argument construction form:
369 //
370 //  FilterDescriptor filter_dimensions;
371 //  filter_dimensions
372 //    .set_output_feature_map_count(42)
373 //    .set_input_feature_map_count(7)
374 //    ...
375 //
376 // Arguments:
377 // - output_feature_map_count: number of feature maps in the output layer.
378 // - input_feature_map_count: number of feature maps in the input layer (from
379 //      which the filter patch is taken).
380 // - input_filter_height: "height" number of neurons used in the sliding window
381 //      over the input layer.
382 // - input_filter_width: "width" number of neurons used in the sliding window
383 //      over the input layer.
384 //
385 // Sometimes names like "filter input height" are referred to by synonymous
386 // terminology, such as "kernel y size".
387 //
388 // If unspecified, layout defaults to kOutputInputYX.
389 class FilterDescriptor {
390  public:
391   // By default construction, all dimensions are set to zero, so they should all
392   // be populated by the user via the named-argument helpers below. (See class
393   // comment for details.)
394   FilterDescriptor();
395   explicit FilterDescriptor(int ndims);
396   ~FilterDescriptor();
397 
398   // Named-argument helpers for avoiding user error during construction.
399   FilterDescriptor& set_output_feature_map_count(int64 value) {
400     tensor_.set_dimensions(0, value);
401     return *this;
402   }
403   FilterDescriptor& set_input_feature_map_count(int64 value) {
404     tensor_.set_dimensions(1, value);
405     return *this;
406   }
407   FilterDescriptor& set_input_filter_height(int64 value) {
408     SetDim(input_filter_dims(), DimIndex::Y, value);
409     return *this;
410   }
411   FilterDescriptor& set_input_filter_width(int64 value) {
412     SetDim(input_filter_dims(), DimIndex::X, value);
413     return *this;
414   }
415   FilterDescriptor& set_layout(FilterLayout layout) {
416     tensor_.set_filter_layout(layout);
417     return *this;
418   }
419   FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
420     SetDim(input_filter_dims(), dim, value);
421     return *this;
422   }
423   int ndims() const { return input_filter_dims().size(); }
424 
425   void CloneFrom(const FilterDescriptor& other);
426 
427   string ToString() const;
428   string ToShortString() const;
429   TensorDescriptorProto ToProto(DataType data_type) const;
430 
431   // Returns the number of weights required as parameters for a convolution
432   // using this filter descriptor.
433   int64 ComputeWeightCount() const;
434 
435   // Returns the number of biases required as parameters for a convolution
436   // using this filter descriptor.
437   int64 bias_count() const { return output_feature_map_count(); }
438 
439   int64 output_feature_map_count() const { return tensor_.dimensions(0); }
440   int64 input_feature_map_count() const { return tensor_.dimensions(1); }
441   int64 input_filter_height() const {
442     return GetDim(input_filter_dims(), DimIndex::Y);
443   }
444   int64 input_filter_width() const {
445     return GetDim(input_filter_dims(), DimIndex::X);
446   }
447   int64 input_filter_dim(DimIndex dim) const {
448     return GetDim(input_filter_dims(), dim);
449   }
450 
451   FilterLayout layout() const { return tensor_.filter_layout(); }
452 
453   absl::Span<const int64> input_filter_dims() const {
454     return AsInt64Slice(tensor_.dimensions()).subspan(2);
455   }
456 
457  private:
458   absl::Span<int64> input_filter_dims() {
459     return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
460   }
461 
462   TensorDescriptorProto tensor_;
463 };
464 
465 // Describes how padding should be aligned when the total number of pad
466 // elements is odd.
// Only kDefault has an explicit value; the remaining enumerators take the next
// consecutive values.
enum class PadAlignment : int64 {
  kDefault = 0,        // default padding for the device.
  kCudnnPadding,       // (== 1) cuDNN padding - prefer to pad at the start.
  kTensorFlowPadding,  // (== 2) TensorFlow padding - prefer to pad at the end.
};
472 
// Returns a string representation of the given padding alignment.
// Declaration only; the implementation is provided elsewhere.
string PadAlignmentString(PadAlignment alignment);

// Print alignment to str. Needed to use CHECK_EQ between two PadAlignments.
// Declaration only; the implementation is provided elsewhere.
std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);
478 
479 // Describes a convolution.
480 //
481 // Uses the named argument construction form:
482 //
483 //  ConvolutionDescriptor convolution_dimensions;
484 //  convolution_dimensions
485 //    .set_vertical_filter_stride(2)
486 //    .set_horizontal_filter_stride(2)
487 //    ...
488 //
489 // Arguments:
490 // - zero_padding_height: padding of the "y dimension" of the input data. Note
491 //    that this is different from the height of the filter.
492 // - zero_padding_width: analogous to the height above, but in the "x
493 //    dimension".
494 // - vertical_filter_stride: the convolution slides a 2-dimensional window of
495 //    filter-height-by-filter-width over the input layer -- the center of that
496 //    window is moved in the "y dimension" according to this stride value.
497 // - horizontal_filter_stride: analogous to the vertical stride above, but in
498 //    the "x dimension".
499 // - vertical_dilation_rate: there will be (vertical_dilation_rate - 1) skipped
500 //   cells between each filter element in the "y dimension".
501 // - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
502 //   skipped cells between each filter element in the "x dimension".
// - convolution_not_crosscorr: By default (convolution_not_crosscorr == false),
504 //   we perform cross correlation rather than convolution. With the flag set,
505 //   we perform convolution. Convolution and cross correlation are related by
506 //   rotating the filter by 180 degrees (or equivalently flipping all spatial
507 //   dimensions).
508 class ConvolutionDescriptor {
509  public:
510   // By default construction, there is no zero-padding and the filter stride is
511   // 1x1 (centering the filter on every cell in the input layer's
512   // width-by-height area).
513   ConvolutionDescriptor();
514   explicit ConvolutionDescriptor(int ndims);
515   ~ConvolutionDescriptor();
516 
517   string ToString() const;
518   string ToShortString() const;
519   ConvolutionDescriptorProto ToProto() const { return proto_; }
520 
521   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
522     SetDim(padding(), DimIndex::Y, value);
523     return *this;
524   }
525   ConvolutionDescriptor& set_zero_padding_width(int64 value) {
526     SetDim(padding(), DimIndex::X, value);
527     return *this;
528   }
529   ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
530     SetDim(padding(), dim, value);
531     return *this;
532   }
533   ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
534     SetDim(strides(), DimIndex::Y, value);
535     return *this;
536   }
537   ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
538     SetDim(strides(), DimIndex::X, value);
539     return *this;
540   }
541   ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
542     SetDim(strides(), dim, value);
543     return *this;
544   }
545   ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
546     SetDim(dilations(), DimIndex::Y, value);
547     return *this;
548   }
549   ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
550     SetDim(dilations(), DimIndex::X, value);
551     return *this;
552   }
553   ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
554     SetDim(dilations(), dim, value);
555     return *this;
556   }
557   ConvolutionDescriptor& set_group_count(int group_count) {
558     proto_.set_group_count(group_count);
559     return *this;
560   }
561   ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
562     proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
563                                      : ConvolutionMode::CROSS_CORRELATION);
564     return *this;
565   }
566   int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
567   int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
568   int64 vertical_filter_stride() const {
569     return GetDim(strides(), DimIndex::Y);
570   }
571   int64 horizontal_filter_stride() const {
572     return GetDim(strides(), DimIndex::X);
573   }
574   int64 vertical_dilation_rate() const {
575     return GetDim(dilations(), DimIndex::Y);
576   }
577   int64 horizontal_dilation_rate() const {
578     return GetDim(dilations(), DimIndex::X);
579   }
580 
581   int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
582   int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
583   int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
584   // TODO(timshen): remove this function. No users of this class is setting a
585   // non-default pad alignment.
586   PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
587   int group_count() const { return proto_.group_count(); }
588   int ndims() const { return padding().size(); }
589   bool convolution_not_crosscorr() const {
590     return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
591   }
592 
593   absl::Span<const int64> strides() const {
594     return AsInt64Slice(proto_.strides());
595   }
596 
597   absl::Span<const int64> dilations() const {
598     return AsInt64Slice(proto_.dilations());
599   }
600 
601   absl::Span<const int64> padding() const {
602     return AsInt64Slice(proto_.paddings());
603   }
604 
605  private:
606   absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }
607 
608   absl::Span<int64> dilations() {
609     return AsInt64Slice(proto_.mutable_dilations());
610   }
611 
612   absl::Span<int64> padding() {
613     return AsInt64Slice(proto_.mutable_paddings());
614   }
615 
616   ConvolutionDescriptorProto proto_;
617 
618   // TODO(leary) cudnn provides these fields, but need to characterize what
619   // their effect is -- they may be boolean rather than integral.
620   // int64 upscale_input_x;
621   // int64 upscale_input_y;
622 };
623 
624 // A patch of values in the input can be pooled via either a max or an average
625 // operation.
626 // Specify int64 so there's no padding in PoolingDescriptor.
// Enumerator values are implicit and consecutive, starting at 0.
enum class PoolingMode : int64 {
  kMaximum,  // (== 0) Pool with a max operation.
  kAverage,  // (== 1) Pool with an average operation.
};
631 
632 // Specify the dimension in which to concatenate inputs in space.
633 // Specify int64 so there's no padding in SpaceConcatenateMode.
// Enumerator values are implicit and consecutive, starting at 0.
enum class SpaceConcatenateMode : int64 {
  XDirection,  // (== 0) Concatenate along the X dimension.
  YDirection,  // (== 1) Concatenate along the Y dimension.
};
638 
// Returns a short name for the pooling mode, e.g. "Avg".
// Declaration only; the implementation is provided elsewhere.
string ShortPoolingModeString(PoolingMode mode);
641 
642 // Describes a pooling operation to be enqueued onto a stream via a platform's
643 // DnnSupport.
644 //
645 // TODO(broune): describe how padding works and what happens if the
646 // window height/width is not divisible by the vertical/horizontal
647 // stride.
648 //
649 // Arguments:
650 //  pooling_mode: pooling operator to use on the input patch
651 //  window_height: height of input window
652 //  window_width: width of input window
653 //  vertical_stride: vertical delta for center of the input patch
654 //  horizontal_stride: horizontal delta for center of the input patch
655 class PoolingDescriptor {
656  public:
657   PoolingDescriptor();
658   explicit PoolingDescriptor(int ndims);
659 
660   PoolingDescriptor& set_pooling_mode(PoolingMode value) {
661     mode_ = value;
662     return *this;
663   }
664   PoolingDescriptor& set_window_height(int64 value) {
665     SetDim(&window_, DimIndex::Y, value);
666     return *this;
667   }
668   PoolingDescriptor& set_window_width(int64 value) {
669     SetDim(&window_, DimIndex::X, value);
670     return *this;
671   }
672   PoolingDescriptor& set_window(DimIndex dim, int64 value) {
673     SetDim(&window_, dim, value);
674     return *this;
675   }
676   PoolingDescriptor& set_vertical_padding(int64 value) {
677     SetDim(&padding_, DimIndex::Y, value);
678     return *this;
679   }
680   PoolingDescriptor& set_horizontal_padding(int64 value) {
681     SetDim(&padding_, DimIndex::X, value);
682     return *this;
683   }
684   PoolingDescriptor& set_padding(DimIndex dim, int64 value) {
685     SetDim(&padding_, dim, value);
686     return *this;
687   }
688   PoolingDescriptor& set_vertical_stride(int64 value) {
689     SetDim(&strides_, DimIndex::Y, value);
690     return *this;
691   }
692   PoolingDescriptor& set_horizontal_stride(int64 value) {
693     SetDim(&strides_, DimIndex::X, value);
694     return *this;
695   }
696   PoolingDescriptor& set_stride(DimIndex dim, int64 value) {
697     SetDim(&strides_, dim, value);
698     return *this;
699   }
700   PoolingDescriptor& set_propagate_nans(bool value) {
701     propagate_nans_ = value;
702     return *this;
703   }
704 
705   int ndims() const { return ndims_; }
706   void CloneFrom(const PoolingDescriptor& other);
707 
708   string ToString() const;
709   string ToShortString() const;
710 
711   PoolingMode mode() const { return mode_; }
712   int64 window_height() const { return GetDim(window_, DimIndex::Y); }
713   int64 window_width() const { return GetDim(window_, DimIndex::X); }
714   int64 window(DimIndex dim) const { return GetDim(window_, dim); }
715   int64 vertical_padding() const { return GetDim(padding_, DimIndex::Y); }
716   int64 horizontal_padding() const { return GetDim(padding_, DimIndex::X); }
717   int64 padding(DimIndex dim) const { return GetDim(padding_, dim); }
718   int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); }
719   int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); }
720   int64 stride(DimIndex dim) const { return GetDim(strides_, dim); }
721   absl::Span<const int64> window() const { return window_; }
722   absl::Span<const int64> padding() const { return padding_; }
723   absl::Span<const int64> strides() const { return strides_; }
724   bool propagate_nans() const { return propagate_nans_; }
725 
726  private:
727   PoolingMode mode_;
728   int ndims_;
729   bool propagate_nans_;
730 
731   // Stored as: ..., y, x.
732   std::vector<int64> window_;
733   std::vector<int64> padding_;
734   std::vector<int64> strides_;
735 };
736 
737 // Collects parameters for DNN algorithms
738 class AlgorithmDesc {
739  public:
740   typedef int64 Index;
741   AlgorithmDesc() : AlgorithmDesc(0, false) {}
742   AlgorithmDesc(Index a, bool use_tensor_ops) {
743     proto_.set_algo_id(a);
744     proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
745                                         : AlgorithmProto::DEFAULT_MATH);
746   }
747   bool tensor_ops_enabled() const {
748     return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
749   }
750   Index algo_id() const { return proto_.algo_id(); }
751   bool operator==(const AlgorithmDesc& other) const {
752     return algo_id() == other.algo_id() &&
753            tensor_ops_enabled() == other.tensor_ops_enabled();
754   }
755   uint64 hash() const;
756 
757   AlgorithmProto ToProto() const { return proto_; }
758 
759  private:
760   AlgorithmProto proto_;
761 };
762 
763 // Describes the result from a perf experiment.
764 //
765 // Arguments:
766 //  algorithm: returns the exact algorithm that was used.
767 //  elapsed_time_in_ms: returns the measured elapsed time in milliseconds.
768 class ProfileResult {
769  public:
770   bool is_valid() const {
771     return algorithm_.has_value() &&
772            elapsed_time_in_ms() != std::numeric_limits<float>::max();
773   }
774 
775   AlgorithmDesc algorithm() const { return *algorithm_; }
776   void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
777 
778   float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
779   void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
780 
781   size_t scratch_size() const { return scratch_size_; }
782   void set_scratch_size(size_t val) { scratch_size_ = val; }
783 
784  private:
785   absl::optional<AlgorithmDesc> algorithm_;
786   float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
787   // The scratch size algorithm_ requires. Currently it's only populated by
788   // convolutions.
789   size_t scratch_size_ = 0;
790 };
791 
// Describes the configuration for the algorithms that will be used.
//
// Arguments:
//  algorithm: the primary algorithm that should be used.
//  algorithm_no_scratch: a secondary algorithm that should be used, if the
//    allocation for the scratch memory fails.
798 class AlgorithmConfig {
799  public:
800   AlgorithmConfig() {}
801   explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
802   AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
803       : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
804   absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; }
805   void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
806   absl::optional<AlgorithmDesc> algorithm_no_scratch() const {
807     return algorithm_no_scratch_;
808   }
809   void set_algorithm_no_scratch(AlgorithmDesc val) {
810     algorithm_no_scratch_ = val;
811   }
812   bool operator==(const AlgorithmConfig& other) const {
813     return this->algorithm_ == other.algorithm_ &&
814            this->algorithm_no_scratch_ == other.algorithm_no_scratch_;
815   }
816   bool operator!=(const AlgorithmConfig& other) const {
817     return !(*this == other);
818   }
819   string ToString() const;
820 
821  private:
822   absl::optional<AlgorithmDesc> algorithm_;
823   absl::optional<AlgorithmDesc> algorithm_no_scratch_;
824 };
825 
826 // Describes a local response normalization (LRN). LRN is used e.g. in
827 // dist_belief.
828 //
829 // Let V be the vector of feature maps at some (batch, y, x)
830 // coordinate. LRN applies independently to each vector V in the
831 // input, across all coordinates (batch, y, x), by mapping each V to
832 // another vector U of the same size using the formula
833 //
834 //   U_i = V_i / ((bias + alpha * (sum_j V_j^2)) ^ beta)
835 //
836 // where the sum is taken over j in the closed range [i - range, i + range].
837 //
838 // When calculating U_i the j in the sum can extend beyond the bounds
839 // of V. If wrap_around is true, then V_j = V_{j mod F} where F is the
840 // size of V, which is the number of feature maps. If wrap_around is
841 // false, then V_j = 0 for j outside [0, F-1].
842 //
843 // If segment_size <= F, where F is the number of feature_maps, then
844 // segment_size has no effect. Otherwise, each consecutive segment of
845 // segment_size entries in V are normalized separately.
846 //
847 // Not all StreamExecutors allow wrap_around == true or segment_size
848 // != 64. Some do not implement normalization at all.
849 class NormalizeDescriptor {
850  public:
851   NormalizeDescriptor();
852 
853   NormalizeDescriptor& set_bias(float bias) {
854     bias_ = bias;
855     return *this;
856   }
857 
858   NormalizeDescriptor& set_range(int32 range) {
859     range_ = range;
860     return *this;
861   }
862 
863   NormalizeDescriptor& set_alpha(float alpha) {
864     alpha_ = alpha;
865     return *this;
866   }
867 
868   NormalizeDescriptor& set_beta(float beta) {
869     beta_ = beta;
870     return *this;
871   }
872 
873   NormalizeDescriptor& set_wrap_around(bool wrap_around) {
874     wrap_around_ = wrap_around;
875     return *this;
876   }
877 
878   NormalizeDescriptor& set_segment_size(int32 segment_size) {
879     segment_size_ = segment_size;
880     return *this;
881   }
882 
883   void CloneFrom(const NormalizeDescriptor& other);
884 
885   string ToString() const;
886   string ToShortString() const;
887 
888   float bias() const { return bias_; }
889   int32 range() const { return range_; }
890   float alpha() const { return alpha_; }
891   float beta() const { return beta_; }
892   bool wrap_around() const { return wrap_around_; }
893   int32 segment_size() const { return segment_size_; }
894 
895  private:
896   float bias_;
897   int32 range_;
898   float alpha_;
899   float beta_;
900   bool wrap_around_;
901   int32 segment_size_;
902 };
903 
// Returns a string representation of the given activation mode.
// Only declared here; the definition is provided out of line.
string ActivationModeString(ActivationMode mode);

// Describes the operation that DoElementwiseOperation should perform on its
// inputs.
//   kAdd:      presumably elementwise addition of the inputs.
//   kMultiply: presumably elementwise multiplication of the inputs.
// NOTE(review): the exact semantics are not shown here; confirm against the
// DoElementwiseOperation implementations.
enum class ElementwiseOperation { kAdd, kMultiply };

// Returns a string for the given elementwise operation (presumably its name,
// mirroring ActivationModeString above -- confirm against the definition).
// Only declared here; the definition is provided out of line.
string ElementwiseOperationString(ElementwiseOperation op);
912 
913 // A simple class representing the version of the backing library, to
914 // workaround the "too perfect forwarding" issue in gcc6+ compilers.
915 // See PR#16309 and issue #18402 for links discussing the issue.
class VersionInfo {
 public:
  // Builds a version triple. Every component defaults to 0, so the
  // constructor also serves as the default constructor; it is deliberately
  // left non-explicit to keep existing implicit conversions working.
  VersionInfo(int major = 0, int minor = 0, int patch = 0)
      : major_{major}, minor_{minor}, patch_{patch} {}

  // Major component of the version.
  int major_version() const { return major_; }

  // Minor component of the version.
  int minor_version() const { return minor_; }

  // Patch component of the version.
  int patch() const { return patch_; }

 private:
  int major_;
  int minor_;
  int patch_;
};
929 
930 // Suite of operations typically used for implementing Deep/Convolutional Neural
931 // Nets. Note: A false return value of an operation indicates the
932 // implementation is not available.
933 //
934 // TODO(b/118763918): this class (or rather dispatch table) has several
935 // problems:
936 // * Some overloads are missing. Ideally we want to have template virtual
937 //   functions while the template arguments is a closed set. However, we don't
938 //   get that from the language.
939 // * The API is a union of cuDNN and another private backend. Only 10% of the
940 //   functions are actually implemented by both backends, the rest are
941 //   actually backend-specific. The massive interface creates extra mental
942 //   burden.
943 // * Poor error handling: the API should return Status objects.
944 //
945 // PrepareForConvolution is an example for how new APIs should be written.
946 class DnnSupport {
947  public:
  // Construction and destruction do no work of their own (both bodies are
  // empty). The destructor is virtual so that an implementation can be
  // destroyed through a DnnSupport pointer.
  DnnSupport() {}
  virtual ~DnnSupport() {}

  // Pure virtual: every implementation must provide it, and the outcome is
  // reported as a port::Status.
  // NOTE(review): presumably performs the backend's one-time initialization;
  // confirm against the implementations.
  virtual port::Status Init() = 0;
952 
953   // Gets the version of the backing library, as a VersionInfo object.
  virtual port::StatusOr<VersionInfo> GetVersion() {
    // Default for platforms that do not override this method: report the
    // query as unimplemented instead of returning a made-up version.
    return port::UnimplementedError(
        "DnnSupport::GetVersion not implemented on this platform.");
  }
958 
959   // Performs a single-precision forward batch normalization operation onto
960   // the stream.
961   //
962   // Arguments:
963   //  stream: borrowed pointer to the stream that the batch normalization
964   //    operation should be enqueued onto.
965   //  x: input data.
966   //  scale: scaling parameters.
967   //  offset: offset parameters.
968   //  estimated_mean: population mean estimated during training.
969   //    Used for inference only; empty for training.
970   //  estimated_variance: population variance estimated during training,
971   //    used for inference only; empty for training.
972   //  x_desc: dimensions of the input data, which is the same as the dimensions
973   //    of the output.
974   //  scale_offset_desc: dimensions of scale and offset.
975   //  epsilon: a small floating point number added to the variance of x.
976   //  y: output data.
977   //  batch_mean: batch mean, to be used to compute the running mean.
  //  batch_var: batch variance, to be used to compute
  //    the running variance.
  //  reserve_space_1: saved mean, to be reused in the backward gradient
  //    computation.
  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance)), to be
  //    reused in the backward gradient computation.
984   //  is_training: Set to true for training, false for inference.
985   //  var_to_inv_var: a function to convert the variance to inverted variance
986   //    for cuDNN v4 forward inference.
987   //  inv_var_to_var: a function to convert the inverted variance to
988   //    variance for cuDNN v4 forward training, to be used for TensorFlow
989   //    to calculate the running variance.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<float>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    // Base-class default: nothing is enqueued and no argument is touched.
    // Per the class comment, a false return means the operation is not
    // available; implementations that support it override this overload.
    return false;
  }
1004 
1005   // Performs a half-precision forwards batch normalization operation onto the
1006   // stream. See DoBatchNormalizationForward above for argument details.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<Eigen::half>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    // Only x and y are Eigen::half in this overload; scale, offset and the
    // mean/variance buffers remain float.
    // Base-class default: false means the operation is not available.
    return false;
  }
1021 
1022   // Performs a single-precision backward batch normalization gradient
1023   // computation operation onto the stream.
1024   //
1025   // Arguments:
1026   //  stream: borrowed pointer to the stream that the batch normalization
1027   //    gradient computation operation should be enqueued onto.
1028   //  y_backprop: gradient with regard to output y.
1029   //  x: input data.
1030   //  scale: scaling parameters.
  //  mean: mean of x (presumably the mean saved by the forward pass --
  //    confirm against callers).
  //  inv_var: 1/sqrt(epsilon + variance) of x.
1032   //  x_desc: dimensions of the input data, which is the same as the dimensions
1033   //    of the output.
1034   //  scale_offset_desc: dimensions of scale and offset.
1035   //  epsilon: a small floating point number added to the variance of x.
1036   //  x_backprop: gradient with respect to input x.
1037   //  scale_backprop: gradient with respect to scale.
1038   //  offset_backprop: gradient with respect to offset.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<float>& y_backprop,
      const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    // Base-class default: nothing is enqueued and no output is written.
    // Per the class comment, a false return means the operation is not
    // available; implementations that support it override this overload.
    return false;
  }
1049 
1050   // Performs a half-precision backward batch normalization gradient computation
1051   // operation onto the stream. See DoBatchNormalizationBackward above for
1052   // argument details.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
      const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* x_backprop,
      DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    // Only y_backprop, x and x_backprop are Eigen::half in this overload; the
    // scale, mean, inv_var and scale/offset gradient buffers remain float.
    // Base-class default: false means the operation is not available.
    return false;
  }
1064 
1065   // Enqueues a fused convolution operation onto the stream.
1066   // We provide several variants with different types for inputs, biases and
1067   // scaling parameters.
1068   //
1069   // Arguments (all borrowed):
1070   //  stream: borrowed pointer to the stream that the 'convolve' operation
1071   //    should be enqueued onto.
1072   //  conv_input_descriptor: dimensions of the convolution input layer.
1073   //  conv_input_data: un-owned device memory region which contains the
1074   //    convolution input.
1075   //  conv_input_scale: a floating point scale to multiply with each element
1076   //    of conv_input_data.
1077   //  filter_descriptor: dimensions of the convolution filter.
1078   //  filter_data: un-owned device memory region which contains the
1079   //    convolution filter weights.
1080   //  convolution_descriptor: stride of the convolution filter.
1081   //  biases: un-owned device memory region containing biases to add to the
1082   //    input.
1083   //  activation_mode: Type of activation to perform.
1084   //  side_input_data: un-owned device memory region which contains optional
1085   //    side input data. If 'side_input_scale' is non-zero, then this must
  //    point to data in the tensor shape specified by output_descriptor.
1087   //    It will be scaled by 'side_input_scale' and added to the convolution
1088   //    result and bias prior to applying the activation function.
1089   //  side_input_scale: a floating point scale to multiply with each element
1090   //    of side_input_data.
1091   //  output_descriptor: dimensions of the output layer.
1092   //  output_data: un-owned device memory region in which to place the
1093   //    convolution result.
1094   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
1095   //    space in order to speed up the convolution operation.
1096   //  algorithm_config: specifies which algorithm should be used for the
1097   //    operation.
1098   //  output_profile_result: the output profile result for this call. The
1099   //    profiling is only enabled when this is not nullptr.
1100   //
1101   // conv_input_descriptor, filter_descriptor, convolution_descriptor and
1102   // output_descriptor together specify exactly how the convolution is aligned
1103   // with the input data:
1104   //
1105   // * (input dimensions - filter size + 1) / filter stride == output dimensions
1106   //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
1107   // * input dimensions / filter stride == output dimensions
1108   //   corresponds to dist_belief padding = SAME, i.e. input and output are the
1109   //   same size - this requires padding the input.
1110   // * (input dimensions + filter size - 1) / filter stride == output dimensions
1111   //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
1112   //   that if the inverse of the filter is applied to the output in VALID mode
1113   //   the result is the same size as the input - this requires even more
1114   //   padding of the input.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<double>& conv_input_data, double conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<double>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<double>& side_input_data, double side_input_scale,
      // NOTE(review): bias_descriptor is not described in the argument list
      // above; presumably the dimensions of `biases` -- confirm.
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    // Double-precision overload: data, biases and both scales are double.
    // Base-class default: nothing is enqueued. Per the class comment, a false
    // return means the operation is not available; implementations that
    // support it override this overload.
    return false;
  }
1130 
1131   // This is the float version of DoFusedConvolve.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<float>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<float>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<float>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    // Single-precision overload: data, biases and both scales are float.
    // Base-class default: false means the operation is not available.
    return false;
  }
1147 
1148   // This is the Eigen::half version of DoFusedConvolve.
1149   // The scaling parameters are still floats.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<Eigen::half>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<Eigen::half>& biases,
      dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<Eigen::half>* output_data,
      ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    // Half-precision overload: data and biases are Eigen::half, while the two
    // scale factors are float.
    // Base-class default: false means the operation is not available.
    return false;
  }
1167 
1168   // This is the int8 version of DoFusedConvolve.
1169   // The bias input and scaling parameters are floats.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int8>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<int8>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    // int8 overload: input, filter, side input and output are int8, while the
    // biases and both scale factors are float.
    // Base-class default: false means the operation is not available.
    return false;
  }
1185 
1186   template <typename ElementType>
1187   port::Status PrepareForConvolution(
1188       ConvolutionKind kind, Stream* stream,
1189       const BatchDescriptor& batch_descriptor,
1190       DeviceMemory<ElementType> input_data,
1191       const FilterDescriptor& filter_descriptor,
1192       DeviceMemory<ElementType> filter_data,
1193       const BatchDescriptor& output_descriptor,
1194       DeviceMemory<ElementType> output_data,
1195       const ConvolutionDescriptor& convolution_descriptor,
1196       const AlgorithmConfig& algorithm_config,
1197       ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
1198       DeviceMemory<uint8>* scratch_memory) {
1199     return DoPrepareForConvolution(
1200         kind, ToDataType<ElementType>::value, stream, batch_descriptor,
1201         input_data, filter_descriptor, filter_data, output_descriptor,
1202         output_data, convolution_descriptor, algorithm_config,
1203         scratch_allocator, algorithm_desc, scratch_memory);
1204   }
1205 
  // Enqueues a convolution operation onto the stream. `kind` selects the
  // direction (forward, backward-data or backward-filter) and `element_type`
  // the element data type.
1207   //
1208   // Arguments (all borrowed):
1209   //  stream: borrowed pointer to the stream that the 'convolve' operation
1210   //    should be enqueued onto.
1211   //  input_descriptor: dimensions of the input layer.
1212   //  input_data: un-owned device memory region which contains the
1213   //    convolution input.
1214   //  filter_descriptor: dimensions of the convolution filter.
1215   //  convolution_descriptor: stride of the convolution filter.
1216   //  output_descriptor: dimensions of the output layer.
1217   //  output_data: un-owned device memory region in which to place the
1218   //    convolution result.
1219   //  algorithm_desc: specifies which algorithm should be used for the
1220   //    operation.
  //  scratch_memory: un-owned device memory for scratch space in order to
  //    speed up the convolution operation.
1223   //  output_profile_result: the output profile result for this call. The
1224   //    profiling is only enabled when this is not nullptr.
1225   //
1226   // input_descriptor, filter_descriptor, convolution_descriptor and
1227   // output_descriptor together specify exactly how the convolution is aligned
1228   // with the input data:
1229   //
1230   // * (input dimensions - filter size + 1) / filter stride == output dimensions
1231   //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
1232   // * input dimensions / filter stride == output dimensions
1233   //   corresponds to dist_belief padding = SAME, i.e. input and output are the
1234   //   same size - this requires padding the input.
1235   // * (input dimensions + filter size - 1) / filter stride == output dimensions
1236   //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
1237   //   that if the inverse of the filter is applied to the output in VALID mode
1238   //   the result is the same size as the input - this requires even more
1239   //   padding of the input.
  //
  // Pure virtual: each implementation must provide it. The typed DoConvolve,
  // DoConvolveBackwardData and DoConvolveBackwardFilter templates in this
  // class all forward here, passing FORWARD, BACKWARD_DATA and
  // BACKWARD_FILTER respectively as `kind`. Buffers arrive type-erased as
  // DeviceMemoryBase, with `element_type` carrying the element type.
  virtual port::Status DoConvolve(
      ConvolutionKind kind, DataType element_type, Stream* stream,
      const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
      const ConvolutionDescriptor& convolution_descriptor,
      AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
      ProfileResult* output_profile_result) = 0;
1248 
1249   template <typename ElementType>
1250   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1251                   const DeviceMemory<ElementType>& input_data,
1252                   const dnn::FilterDescriptor& filter_descriptor,
1253                   const DeviceMemory<ElementType>& filter_data,
1254                   const dnn::ConvolutionDescriptor& convolution_descriptor,
1255                   const dnn::BatchDescriptor& output_descriptor,
1256                   DeviceMemory<ElementType>* output_data,
1257                   const dnn::AlgorithmDesc& algorithm_desc,
1258                   DeviceMemory<uint8>* scratch_memory,
1259                   ProfileResult* output_profile_result) {
1260     return IsStatusOk(
1261         DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
1262                    stream, input_descriptor, input_data, filter_descriptor,
1263                    filter_data, output_descriptor, *output_data,
1264                    convolution_descriptor, algorithm_desc, *scratch_memory,
1265                    output_profile_result),
1266         !output_profile_result);
1267   }
1268 
1269   // Return a list of algorithms supported by the forward convolution pass.
1270   // cc_major and cc_minor are the compute capabilities of the device.
  //
  // Only declared here; the definition is provided out of line.
  // NOTE(review): the meaning of the bool result and of
  // with_winograd_nonfused is not documented in this part of the header;
  // presumably the result signals success and the list is written to
  // *out_algorithms -- confirm against the definition.
  virtual bool GetConvolveAlgorithms(
      bool with_winograd_nonfused, int cc_major, int cc_minor,
      std::vector<AlgorithmDesc>* out_algorithms);
1274 
1275   // Returns a list of supported rnn algorithms.
  // Only declared here; the definition is provided out of line.
  // NOTE(review): presumably the list is written to *out_algorithms and the
  // bool result signals success -- confirm against the definition.
  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
1277 
1278   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
1279   // coefficient_scales specifies the scaling of each column of coefficients:
1280   // original float coefficient[row * num_columns + column] =
1281   //     quantized coefficient[row * num_columns + column] *
1282   //     coefficient_scales[column].
  //
  // This overload takes int8 filter coefficients. Pure virtual: every
  // implementation must provide it.
  virtual bool DoConvolveQuantized(
      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int8>& filter_coefficients,
      const DeviceMemory<float>& coefficient_scales,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;
1292 
  // Same as DoConvolveQuantized above, but with int16 filter coefficients.
  // This overload takes int16 filter coefficients (see the parameter type
  // below). Pure virtual: every implementation must provide it.
  virtual bool DoConvolveQuantized(
      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int16>& filter_coefficients,
      const DeviceMemory<float>& coefficient_scales,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;
1303 
1304   // Variation of the above with the weight matrix split into two matrices.
1305   // first_weights: Coefficients of the first matrix.
1306   // second_weights: Coefficients of the second matrix.
1307   // depth_multiplier: specifies the columns of the first matrix and rows
1308   // of the second one - first_weights columns = depth_multiplier,
1309   // second_weights rows = depth_multiplier *
1310   //                       filter_descriptor.input_feature_map_count().
1311   // see go/separable for documentation on separable convolutions.
  //
  // Pure virtual: every implementation must provide it. All buffers are
  // float; the result is written through output_data.
  virtual bool DoSeparableConvolve(
      Stream* stream, const BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const FilterDescriptor& filter_descriptor, int depth_multiplier,
      const DeviceMemory<float>& first_weights,
      const DeviceMemory<float>& second_weights,
      const ConvolutionDescriptor& convolution_descriptor,
      const BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;
1321 
1322   // Enqueues a single-precision backward convolution (for data) operation onto
1323   // the stream.
1324   //
1325   // Arguments:
1326   //  stream: borrowed pointer to the stream that the 'convolve' operation
1327   //    should be enqueued onto.
1328   //  filter_descriptor: dimensions of the convolution filter.
1329   //  filter_data: coefficients for the convolution filter.
1330   //  output_descriptor: dimensions of the output gradients, which is the same
1331   //    as the dimensions of the output.
1332   //  backward_output_data: un-owned device memory region which contains the
1333   //    backprop of the output.
1334   //  convolution_descriptor: stride of the convolution filter.
1335   //  input_descriptor: dimensions of the input layer.
1336   //  backward_input_data: un-owned device memory region in which to place the
1337   //    backprop of the input.
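  //  algorithm_desc: specifies which algorithm should be used for the
  //    operation.
  //  scratch_memory: un-owned device memory for scratch space in order to
  //    speed up the convolution operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.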
1340   template <typename ElementType>
1341   bool DoConvolveBackwardData(
1342       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
1343       const DeviceMemory<ElementType>& filter_data,
1344       const dnn::BatchDescriptor& output_descriptor,
1345       const DeviceMemory<ElementType>& backward_output_data,
1346       const dnn::ConvolutionDescriptor& convolution_descriptor,
1347       const dnn::BatchDescriptor& input_descriptor,
1348       DeviceMemory<ElementType>* backward_input_data,
1349       const dnn::AlgorithmDesc& algorithm_desc,
1350       DeviceMemory<uint8>* scratch_memory,
1351       ProfileResult* output_profile_result) {
1352     return IsStatusOk(
1353         DoConvolve(ConvolutionKind::BACKWARD_DATA,
1354                    ToDataType<ElementType>::value, stream, input_descriptor,
1355                    *backward_input_data, filter_descriptor, filter_data,
1356                    output_descriptor, backward_output_data,
1357                    convolution_descriptor, algorithm_desc, *scratch_memory,
1358                    output_profile_result),
1359         !output_profile_result);
1360   }
1361 
1362   // Return a list of algorithms supported by the backward convolution pass for
1363   // data.
  //
  // Only declared here; the definition is provided out of line.
  // NOTE(review): presumably the list is written to *out_algorithms and the
  // bool result signals success; with_winograd_nonfused is not documented in
  // this part of the header -- confirm against the definition.
  virtual bool GetConvolveBackwardDataAlgorithms(
      bool with_winograd_nonfused, int cc_major, int cc_minor,
      std::vector<AlgorithmDesc>* out_algorithms);
1367 
1368   // Enqueues a single-precision backward convolution (for filter) operation
1369   // onto the stream.
1370   //
1371   // Arguments:
1372   //  stream: borrowed pointer to the stream that the 'convolve' operation
1373   //    should be enqueued onto.
1374   //  input_descriptor: dimensions of the input layer.
1375   //  input_data: un-owned device memory region which contains the
1376   //    convolution input.
1377   //  output_descriptor: dimensions of the output gradients, which is the same
1378   //    as the dimensions of the output.
1379   //  backward_output_data: un-owned device memory region which contains the
1380   //    backprop of the output.
1381   //  convolution_descriptor: stride of the convolution filter.
1382   //  filter_descriptor: dimensions of the convolution filter.
1383   //  backward_filter_data: un-owned device memory region in which to place the
1384   //    backprop of the filter.
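  //  algorithm_desc: specifies which algorithm should be used for the
  //    operation.
  //  scratch_memory: un-owned device memory for scratch space in order to
  //    speed up the convolution operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.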
1387   template <typename ElementType>
1388   bool DoConvolveBackwardFilter(
1389       Stream* stream, const BatchDescriptor& input_descriptor,
1390       const DeviceMemory<ElementType>& input_data,
1391       const BatchDescriptor& output_descriptor,
1392       const DeviceMemory<ElementType>& backward_output_data,
1393       const ConvolutionDescriptor& convolution_descriptor,
1394       const FilterDescriptor& filter_descriptor,
1395       DeviceMemory<ElementType>* backward_filter_data,
1396       const dnn::AlgorithmDesc& algorithm_desc,
1397       DeviceMemory<uint8>* scratch_memory,
1398       ProfileResult* output_profile_result) {
1399     return IsStatusOk(
1400         DoConvolve(ConvolutionKind::BACKWARD_FILTER,
1401                    ToDataType<ElementType>::value, stream, input_descriptor,
1402                    input_data, filter_descriptor, *backward_filter_data,
1403                    output_descriptor, backward_output_data,
1404                    convolution_descriptor, algorithm_desc, *scratch_memory,
1405                    output_profile_result),
1406         !output_profile_result);
1407   }
1408 
1409   // Return a list of algorithms supported by the backward convolution pass for
1410   // filters.
  //
  // Only declared here; the definition is provided out of line.
  // NOTE(review): presumably the list is written to *out_algorithms and the
  // bool result signals success; with_winograd_nonfused is not documented in
  // this part of the header -- confirm against the definition.
  virtual bool GetConvolveBackwardFilterAlgorithms(
      bool with_winograd_nonfused, int cc_major, int cc_minor,
      std::vector<AlgorithmDesc>* out_algorithms);
1414 
1415   // Enqueues a single-precision backward convolution (for bias) operation onto
1416   // the stream.
1417   //
1418   // Arguments:
1419   //  stream: borrowed pointer to the stream that the 'convolve' operation
1420   //    should be enqueued onto.
1421   //  input_descriptor: dimensions of the input layer.
1422   //  input_data: un-owned device memory region which contains the
1423   //    convolution input.
1424   //  bias_descriptor: dimensions of the bias tensor. Should be the same as the
1425   //    input dimensions, but with the spatial dimensions set to 1.
  //  backward_bias_data: un-owned device memory region in which to place the
  //    backprop of the bias.
  virtual bool DoConvolveBackwardBias(Stream* stream,
                                      const BatchDescriptor& input_descriptor,
                                      const DeviceMemory<float>& input_data,
                                      const BatchDescriptor& bias_descriptor,
                                      DeviceMemory<float>* backward_bias_data) {
    // Single-precision overload. Base-class default: nothing is enqueued and
    // backward_bias_data is not written. Per the class comment, a false
    // return means the operation is not available; implementations that
    // support it override this overload.
    return false;
  }
1435 
  // Double-precision overload of DoConvolveBackwardBias: 'input_data' and
  // 'backward_bias_data' hold double elements.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoConvolveBackwardBias(
      Stream* stream, const BatchDescriptor& input_descriptor,
      const DeviceMemory<double>& input_data,
      const BatchDescriptor& bias_descriptor,
      DeviceMemory<double>* backward_bias_data) {
    return false;
  }
1443 
  // Half-precision overload of DoConvolveBackwardBias: 'input_data' and
  // 'backward_bias_data' hold Eigen::half elements.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoConvolveBackwardBias(
      Stream* stream, const BatchDescriptor& input_descriptor,
      const DeviceMemory<Eigen::half>& input_data,
      const BatchDescriptor& bias_descriptor,
      DeviceMemory<Eigen::half>* backward_bias_data) {
    return false;
  }
1451 
  // Fully connects the "nodes" (float values) in input_data with
  // shape input_dimensions to output_data with output_dimensions
  // using provided weights. This is equivalent to computing a matrix
  // product, hence the name MatMul.
  //
  // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
  // happen in two dimensions. To get down to two dimensions, we consider the
  // input y, x and depth dimension as one combined dimension T. For now,
  // assume that the output height and width are 1 and let OD be the output
  // depth.
  //
  // There are three device memory buffers passed in to this
  // function. We can now view all three as matrices:
  //
  //   input_data: A batch x T matrix
  //   weights: A T x OD matrix
  //   output_data: A batch x OD matrix
  //
  // This function then computes the matrix product of input_data and
  // weights and writes the result into output_data.
  //
  // Here the weights buffer is in row major order, i.e. the first OD
  // entries in weights are the first row, the second OD entries in
  // weights are the second row and so on.
  //
  // The case for output width*height > 1 is more complicated. Let K =
  // OY * OX where OY is the output height and OX is the output
  // width. Then weights is divided into K sub-arrays W_i, for
  // i=0,...,K-1, that each represent a T x OD matrix. This function
  // then computes the K matrix multiplications of input_data with
  // each W_i. This creates K matrices with dimensions batch x
  // OD. These K matrices are concatenated horizontally to form one
  // larger matrix with dimensions batch x (K*OD); note that this is
  // not the same as concatenating the bytes of the matrices. The
  // combined matrix can then be interpreted as a tensor with
  // dimensions (batch, OY, OX, OD). If the output tensor format is
  // not kBatchYXDepth, this function would then need to arrange for
  // the output to be in the requested layout, if that is
  // supported. Note that the case K=1 is equivalent to the
  // description above. It is recommended to prefer the case K=1.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'fully connect' operation
  //    should be enqueued onto.
  //  input_data: un-owned device memory region holding the batch x T input
  //    matrix described above.
  //  weights: un-owned device memory region holding the row-major weights
  //    described above.
  //  input_dimensions: the shape of input_data.
  //  output_dimensions: the shape of output_data.
  //  output_data: un-owned device memory region in which to place the
  //    fully connected result.
  virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
                        const DeviceMemory<float>& weights,
                        const dnn::BatchDescriptor& input_dimensions,
                        const dnn::BatchDescriptor& output_dimensions,
                        DeviceMemory<float>* output_data) = 0;
1503 
  // Version of DoMatMul that uses pre-quantized 8 bit weights.
  // weight_scales specifies the scaling of each column of weights:
  // original float weight[row * num_columns + column] =
  //     quantized_weight[row * num_columns + column] * weight_scales[column].
  //
  // The remaining parameters mirror those of DoMatMul: 'input_data' is read
  // using 'input_dimensions' and the result is written to 'output_data' using
  // 'output_dimensions'.
  virtual bool DoMatMulQuantized(Stream* stream,
                                 const DeviceMemory<float>& input_data,
                                 const DeviceMemory<int8>& quantized_weights,
                                 const DeviceMemory<float>& weight_scales,
                                 const dnn::BatchDescriptor& input_dimensions,
                                 const dnn::BatchDescriptor& output_dimensions,
                                 DeviceMemory<float>* output_data) = 0;
1515 
  // Version of DoMatMul that uses pre-quantized 16 bit weights.
  // weight_scales specifies the scaling of each column of weights:
  // original float weight[row * num_columns + column] =
  //     quantized_weight[row * num_columns + column] * weight_scales[column].
  //
  // The remaining parameters mirror those of DoMatMul: 'input_data' is read
  // using 'input_dimensions' and the result is written to 'output_data' using
  // 'output_dimensions'.
  virtual bool DoMatMulQuantized(Stream* stream,
                                 const DeviceMemory<float>& input_data,
                                 const DeviceMemory<int16>& quantized_weights,
                                 const DeviceMemory<float>& weight_scales,
                                 const dnn::BatchDescriptor& input_dimensions,
                                 const dnn::BatchDescriptor& output_dimensions,
                                 DeviceMemory<float>* output_data) = 0;
1527 
  // Adds biases to the feature maps in input_data producing
  // output_data. input_data can equal output_data, but must not
  // partially overlap it.
  //
  // Let K = count() * height() * width() and N = feature_map_count()
  // on dimensions. Then input_data contains K*N values and biases
  // contains N values. We can thus logically consider input_data to
  // contain K vectors of N elements each. This function adds biases
  // to each of those K vectors.
  //
  // TODO(broune): This works differently when width() * height() > 1
  // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In
  // that case there should be width() * height() *
  // feature_map_count() biases, but this is not implemented on all
  // StreamExecutors.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'bias add' operation
  //    should be enqueued onto.
  //  input_data: un-owned device memory region containing the input.
  //  biases: un-owned device memory region containing biases to add to the
  //    input.
  //  dimensions: dimensions of input_data and output_data.
  //  output_data: un-owned device memory region in which to place the result.
  virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
                         const DeviceMemory<float>& biases,
                         const dnn::BatchDescriptor& dimensions,
                         DeviceMemory<float>* output_data) = 0;
1556 
  // Performs a forward pooling operation on input_data, writing to
  // output_data. See PoolingDescriptor for how to configure the
  // pooling operation.
  //
  // Pooling happens as a window that moves across the Y and X
  // dimensions of input_data, where each position of the window
  // yields one output value. E.g. for max pooling, the computed value
  // is the maximum element in the window. The operation is applied
  // independently to each batch and at each feature map (depth), so
  // that the output depth and feature_map_count are the same as for
  // the input. The output width and height can be different.
  //
  // Arguments:
  //  stream: the stream that the pooling operation is issued on.
  //  pooling_dimensions: configuration of the pooling window.
  //  input_dimensions: the shape of input_data.
  //  input_data: device memory region containing the pooling input.
  //  output_dimensions: the shape of output_data.
  //  output_data: device memory region in which to place the pooling result.
  //  workspace_allocator: presumably used by implementations to obtain
  //    temporary scratch memory -- TODO confirm.
  virtual bool DoPoolForward(Stream* stream,
                             const dnn::PoolingDescriptor& pooling_dimensions,
                             const dnn::BatchDescriptor& input_dimensions,
                             const DeviceMemory<float>& input_data,
                             const dnn::BatchDescriptor& output_dimensions,
                             DeviceMemory<float>* output_data,
                             ScratchAllocator* workspace_allocator) = 0;
1577 
  // Double-precision overload of DoPoolForward.
  //
  // This default implementation does not perform pooling: it logs a FATAL
  // message stating that the double variant is not implemented.
  virtual bool DoPoolForward(Stream* stream,
                             const dnn::PoolingDescriptor& pooling_dimensions,
                             const dnn::BatchDescriptor& input_dimensions,
                             const DeviceMemory<double>& input_data,
                             const dnn::BatchDescriptor& output_dimensions,
                             DeviceMemory<double>* output_data,
                             ScratchAllocator* workspace_allocator) {
    LOG(FATAL) << "DoPoolForward not implemented for double.";
    // NOTE(review): presumably never reached, since LOG(FATAL) is not expected
    // to return; the statement satisfies the bool return type.
    return false;
  }
1588 
  // Half-precision (Eigen::half) overload of DoPoolForward.
  //
  // This default implementation does not perform pooling: it logs a FATAL
  // message stating that the float16 variant is not implemented.
  virtual bool DoPoolForward(Stream* stream,
                             const dnn::PoolingDescriptor& pooling_dimensions,
                             const dnn::BatchDescriptor& input_dimensions,
                             const DeviceMemory<Eigen::half>& input_data,
                             const dnn::BatchDescriptor& output_dimensions,
                             DeviceMemory<Eigen::half>* output_data,
                             ScratchAllocator* workspace_allocator) {
    LOG(FATAL) << "DoPoolForward not implemented for float16.";
    // NOTE(review): presumably never reached, since LOG(FATAL) is not expected
    // to return; the statement satisfies the bool return type.
    return false;
  }
1599 
  // 8-bit integer (int8) overload of DoPoolForward.
  //
  // This default implementation does not perform pooling: it logs a FATAL
  // message stating that the int8 variant is not implemented.
  virtual bool DoPoolForward(Stream* stream,
                             const dnn::PoolingDescriptor& pooling_dimensions,
                             const dnn::BatchDescriptor& input_dimensions,
                             const DeviceMemory<int8>& input_data,
                             const dnn::BatchDescriptor& output_dimensions,
                             DeviceMemory<int8>* output_data,
                             ScratchAllocator* workspace_allocator) {
    LOG(FATAL) << "DoPoolForward not implemented for int8.";
    // NOTE(review): presumably never reached, since LOG(FATAL) is not expected
    // to return; the statement satisfies the bool return type.
    return false;
  }
1610 
1611   // Performs differentiation of the pooling operation.
1612   virtual bool DoPoolBackward(Stream* stream,
1613                               const dnn::PoolingDescriptor& pooling_dimensions,
1614                               const dnn::BatchDescriptor& input_dimensions,
1615                               const DeviceMemory<double>& input_data,
1616                               const dnn::BatchDescriptor& output_dimensions,
1617                               const DeviceMemory<double>& output_data,
1618                               const DeviceMemory<double>& input_diff_data,
1619                               DeviceMemory<double>* output_diff_data,
1620                               ScratchAllocator* workspace_allocator) {
1621     LOG(FATAL) << "DoPoolBackward not implemented.";
1622     return false;
1623   }
1624 
1625   virtual bool DoPoolBackward(Stream* stream,
1626                               const dnn::PoolingDescriptor& pooling_dimensions,
1627                               const dnn::BatchDescriptor& input_dimensions,
1628                               const DeviceMemory<float>& input_data,
1629                               const dnn::BatchDescriptor& output_dimensions,
1630                               const DeviceMemory<float>& output_data,
1631                               const DeviceMemory<float>& input_diff_data,
1632                               DeviceMemory<float>* output_diff_data,
1633                               ScratchAllocator* workspace_allocator) {
1634     LOG(FATAL) << "DoPoolBackward not implemented.";
1635     return false;
1636   }
1637 
1638   virtual bool DoPoolBackward(Stream* stream,
1639                               const dnn::PoolingDescriptor& pooling_dimensions,
1640                               const dnn::BatchDescriptor& input_dimensions,
1641                               const DeviceMemory<Eigen::half>& input_data,
1642                               const dnn::BatchDescriptor& output_dimensions,
1643                               const DeviceMemory<Eigen::half>& output_data,
1644                               const DeviceMemory<Eigen::half>& input_diff_data,
1645                               DeviceMemory<Eigen::half>* output_diff_data,
1646                               ScratchAllocator* workspace_allocator) {
1647     LOG(FATAL) << "DoPoolBackward not implemented.";
1648     return false;
1649   }
1650 
  // Applies local response normalization to the values from input_data and
  // writes the result to output_data.
  //
  // See comments on NormalizeDescriptor for a description of local response
  // normalization.
  //
  // Arguments:
  //  stream: the stream that the normalization is issued on.
  //  normalize_descriptor: configuration of the normalization.
  //  dimensions: presumably the shape shared by input_data and output_data --
  //    TODO confirm.
  //  input_data: device memory region containing the values to normalize.
  //  output_data: device memory region in which to place the result.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoNormalizeWithDimensions(
      Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
      const dnn::BatchDescriptor& dimensions,
      const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
    return false;
  }
1662 
  // Performs backpropagation for the normalization operation.
  //
  // Given raw data, its corresponding normalized output, and a gradient of some
  // unspecified function with respect to the normalized variables, computes the
  // gradient of that unspecified function with respect to the raw variables.
  //
  // The normalized data input array is expected to match the output that would
  // be obtained by running the raw data input array through the
  // DoNormalizeWithDimensions method above.
  //
  // See comments on NormalizeDescriptor for a description of local response
  // normalization.
  //
  // Arguments:
  //  stream: the stream that the operation is issued on.
  //  normalize_descriptor: configuration of the normalization.
  //  dimensions: presumably the shape shared by all data buffers -- TODO
  //    confirm.
  //  raw_data: the raw (un-normalized) data.
  //  normalized_data: the normalized output corresponding to raw_data.
  //  normalized_variable_gradient: gradient with respect to the normalized
  //    variables.
  //  raw_variable_gradient: receives the gradient with respect to the raw
  //    variables.
  //  workspace_allocator: presumably used by implementations to obtain
  //    temporary scratch memory -- TODO confirm.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoNormalizeBackwardWithDimensions(
      Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
      const dnn::BatchDescriptor& dimensions,
      const DeviceMemory<float>& raw_data,
      const DeviceMemory<float>& normalized_data,
      const DeviceMemory<float>& normalized_variable_gradient,
      DeviceMemory<float>* raw_variable_gradient,
      ScratchAllocator* workspace_allocator) {
    return false;
  }
1685 
  // Applies an activation function (see ActivationMode) to all of the values
  // held on the device in 'input_data', whose dimensions are described by
  // 'dimensions'.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'activate' operation
  //    should be enqueued onto.
  //  activation_mode: Type of activation to perform.
  //  dimensions: the dimensions of 'input_data'.
  //  input_data: un-owned device memory region which contains the
  //    activate input.
  //  output_data: un-owned device memory region in which to place the
  //    activate result.
  //  options: NOTE(review): not described in this header; presumably a bit
  //    mask of implementation-specific flags -- confirm.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoActivate(Stream* stream, ActivationMode activation_mode,
                          const BatchDescriptor& dimensions,
                          const DeviceMemory<float>& input_data,
                          DeviceMemory<float>* output_data, uint64 options) {
    return false;
  }
1704 
  // Concatenates several layers into one, by concatenating the depth of each
  // layer at matching x and y coordinates.
  // The inputs must all have the same width and height, the output will have
  // the same width and height as the inputs and its depth will be the sum of
  // the input depths.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'depth concatenate'
  //    operation should be enqueued onto.
  //  input_dimensions: The dimensions of each input.
  //  input_data: un-owned device memory region which contains the
  //    input data for each input layer.
  //  output_data: un-owned device memory region in which to place the
  //    depth concatenate result.
  virtual bool DoDepthConcatenate(
      Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
      port::ArraySlice<const DeviceMemory<float>*> input_data,
      DeviceMemory<float>* output_data) = 0;
1723 
  // Concatenates several layers into one, by concatenating each in the
  // x-dimension or y-dimension, based on a user-specified flag.
  // For x-concatenation, layers are aligned at matching y and depth
  // coordinates, and for y-concatenation, they are aligned at matching x and
  // depth coordinates. The inputs must all have the same depth and batch size.
  // For x-concatenation, the inputs must have the same height (y-size), and the
  // output will have the same depth and height as the inputs and its width (x-
  // size) will be the sum of the input widths.  For y-concatenation, the inputs
  // must have the same width, and the output will have the same depth and width
  // as the inputs, and its height will be the sum of the input heights.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the 'space concatenate'
  //    operation should be enqueued onto.
  //  input_dimensions: the dimensions of each input.
  //  input_data: un-owned device memory region which contains the input data
  //    for each input layer.
  //  output_data: un-owned device memory region in which to place the space
  //    concatenate result.
  //  concat_direction:  either dnn::SpaceConcatenateMode::XDirection or
  //    dnn::SpaceConcatenateMode::YDirection.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoSpaceConcatenate(
      Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
      port::ArraySlice<const DeviceMemory<float>*> input_data,
      DeviceMemory<float>* output_data,
      dnn::SpaceConcatenateMode concat_direction) {
    return false;
  }
1752 
  // Change the layout of the data by shrinking one dimension (or set of
  // dimensions) and growing another dimension (or set of dimensions), while
  // keeping the total number of data elements constant, and maintaining the
  // current data ordering.
  //
  // Currently, the only supported operation is depth into space by a power of
  // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
  //
  // Note that Reshape may not be a no-op, depending on the platform and which
  // dimensions are being changed.
  //
  // Example: forgetting about batch for the moment, let's take a tensor that's
  // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
  // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
  // elements of the tensor range from 0 to 15. The y,x,z indices are below each
  // element.
  //
  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
  // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
  // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
  //
  // reshape to 4x2x2
  //
  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
  // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
  // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
  //
  // Arguments:
  //  stream: the stream that the reshape is issued on.
  //  input_dimensions: the dimensions of input_data before the reshape.
  //  input_data: device memory region containing the data to reshape.
  //  output_dimensions: the dimensions requested for output_data.
  //  output_data: device memory region in which to place the reshaped data.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoReshape(Stream* stream,
                         const dnn::BatchDescriptor& input_dimensions,
                         const DeviceMemory<float>& input_data,
                         const dnn::BatchDescriptor& output_dimensions,
                         DeviceMemory<float>* output_data) {
    return false;
  }
1788 
  // Depth to space takes an X by Y image with depth D*M² and changes it to an
  // MX x MY image with depth D. Each input location (x,y) with depth D*M² in
  // the input image is changed to an MxM contiguous area in the output image,
  // with the values being laid out in the raster order by DepthToSpaceLayout,
  // and will have a new depth of D.
  //
  // Example.
  // M=2, Din=8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2
  // DepthHeightWidth layout
  // Values within a 'cell' are at different depths and same x & y.
  // Input:
  // abcdefgh  ijklmnop
  // qrstuvwx  yz012345
  // Output:
  // ae bf im jn
  // cg dh ko lp
  // qu rv y2 z3
  // sw tx 04 15
  //
  // Arguments:
  //  stream: the stream that the operation is issued on.
  //  input_dimensions: the dimensions of input_data.
  //  input_data: device memory region containing the input image.
  //  depth_to_space_layout: the raster order used to lay out the values.
  //  sqrt_depth_reduction: 'M' in the comment above.
  //  output_data: device memory region in which to place the result.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoDepthToSpace(Stream* stream,
                              const dnn::BatchDescriptor& input_dimensions,
                              const DeviceMemory<float>& input_data,
                              const DepthToSpaceLayout& depth_to_space_layout,
                              const int& sqrt_depth_reduction,
                              DeviceMemory<float>* output_data) {
    return false;
  }
1817 
  // Space to depth is the inverse of depth to space. Space to depth takes each
  // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
  // the input, and transforms it to a 1 by 1 patch with depth D*M². If the
  // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of
  // data elements is not changed.
  //
  // Example.
  // M=2, Din=2, Xin=4, Yin=4, Dout=8
  // DepthHeightWidth layout
  // Values within a 'cell' are at different depths and same x & y.
  // Input:
  // ae bf im jn
  // cg dh ko lp
  // qu rv y2 z3
  // sw tx 04 15
  // Output:
  // abcdefgh  ijklmnop
  // qrstuvwx  yz012345
  //
  // Arguments:
  //  stream: the stream that the operation is issued on.
  //  input_dimensions: the dimensions of input_data.
  //  input_data: device memory region containing the input image.
  //  space_to_depth_layout: layout descriptor; NOTE(review): it reuses the
  //    DepthToSpaceLayout type, presumably naming the same raster order --
  //    confirm.
  //  sqrt_depth_increase: 'M' in the comment above.
  //  output_data: device memory region in which to place the result.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoSpaceToDepth(Stream* stream,
                              const dnn::BatchDescriptor& input_dimensions,
                              const DeviceMemory<float>& input_data,
                              const DepthToSpaceLayout& space_to_depth_layout,
                              const int& sqrt_depth_increase,
                              DeviceMemory<float>* output_data) {
    return false;
  }
1846 
  // Computes the specified operation (e.g. addition or multiplication)
  // between corresponding elements in the inputs and stores the result in the
  // output element.
  // The inputs and output must all have the same dimensions, but may have
  // different quantization parameters (min_value and max_value).
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'elementwise operation'
  //    should be enqueued onto.
  //  operation: The operation to perform.
  //  input_dimensions: The dimensions of each input.
  //  input_data: un-owned device memory region which contains the
  //    input data for each input layer.
  //  output_dimensions: The dimensions of the output.
  //  output_data: un-owned device memory region in which to place the
  //    operation result.
  virtual bool DoElementwiseOperate(
      Stream* stream, ElementwiseOperation operation,
      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
      port::ArraySlice<const DeviceMemory<float>*> input_data,
      const dnn::BatchDescriptor& output_dimensions,
      DeviceMemory<float>* output_data) = 0;
1869 
  // Computes the specified operation (e.g. addition or multiplication)
  // between corresponding elements in the inputs and stores the result in the
  // output element. Each input is multiplied by a scalar constant and the
  // result is divided by a scalar constant.
  // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11
  // and the output divisor to 10.
  // The inputs and output must all have the same dimensions, but may have
  // different quantization parameters (min_value and max_value).
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'elementwise operation'
  //    should be enqueued onto.
  //  operation: The operation to perform.
  //  input_multiplicands: Amount to scale each input.
  //  output_divisor: Amount to divide the output.
  //  input_dimensions: The dimensions of each input.
  //  input_data: un-owned device memory region which contains the
  //    input data for each input layer.
  //  output_dimensions: The dimensions of the output.
  //  output_data: un-owned device memory region in which to place the
  //    operation result.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoElementwiseOperateScaledQuantized(
      Stream* stream, ElementwiseOperation operation,
      port::ArraySlice<int> input_multiplicands, int output_divisor,
      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
      port::ArraySlice<const DeviceMemory<float>*> input_data,
      const dnn::BatchDescriptor& output_dimensions,
      DeviceMemory<float>* output_data) {
    return false;
  }
1900 
1901   // Pads the input with zeros in the X and Y dimensions. The feature_map
1902   // dimension is unchanged.
1903   //
1904   // Arguments (all borrowed):
1905   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1906   // should be enqueued onto.
1907   //  dimensions: The dimensions of the input.
1908   //  input_data: un-owned device memory region which contains the
1909   //    input data for the input layer.
1910   //  left_pad: Amount to pad the input on the left.
1911   //  right_pad: Amount to pad the input on the right.
1912   //  top_pad: Amount to pad the input at the top (low Y).
1913   //  bottom_pad: Amount to pad the input at the bottom (high Y).
1914   //  output_data: un-owned device memory region in which to place the
1915   //    padded result.
1916   virtual bool DoXYPad(Stream* stream, const dnn::BatchDescriptor &dimensions,
1917                        const DeviceMemory<float> &input_data,
1918                        int64 left_pad, int64 right_pad, int64 top_pad,
1919                        int64 bottom_pad, DeviceMemory<float> *output_data) = 0;
1920 
1921   // Extracts a slice of the input in the X and Y dimensions. The feature_map
1922   // dimension is unchanged.
1923   //
1924   // Arguments (all borrowed):
1925   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1926   // should be enqueued onto.
1927   //  dimensions: The dimensions of the input.
1928   //  input_data: un-owned device memory region which contains the
1929   //    input data for the input layer.
1930   //  left_trim: Amount to cut off the input on the left.
1931   //  right_trim: Amount to cut off the input on the right.
1932   //  top_trim: Amount to cut off the input at the top (low y).
1933   //  bottom_trim: Amount to cut off the input at the bottom (high Y).
1934   //  output_data: un-owned device memory region in which to place the
1935   //    padded result.
1936   virtual bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor &dimensions,
1937                     const DeviceMemory<float> &input_data,
1938                     int64 left_trim, int64 right_trim, int64 top_trim,
1939                     int64 bottom_trim, DeviceMemory<float> *output_data) = 0;
1940 
  // Grows the input tensor by replicating the X and Y dimensions. The batch and
  // depth/feature_map dimensions are unchanged. Currently, the input tensor is
  // limited to X=1 and Y=1.
  //
  // For example (illustrating the general behavior, beyond the current X=1,
  // Y=1 limitation), the input has dimensions x=2, y=3, and replicate_x=3,
  // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1,
  // x0y2, x1y0, x0y1, x1y2].
  // Here is the example as a picture. input:
  // AB
  // CD
  // EF
  // broadcast result:
  // ABABAB
  // CDCDCD
  // EFEFEF
  // ABABAB
  // CDCDCD
  // EFEFEF
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the broadcast operation
  //    should be enqueued onto.
  //  dimensions: The dimensions of the input.
  //  input_data: un-owned device memory region which contains the
  //    input data for the input layer.
  //  replicate_x: Amount to replicate the input's X dimension.
  //  replicate_y: Amount to replicate the input's Y dimension.
  //  output_data: un-owned device memory region in which to place the
  //    broadcast result.
  //
  // This default implementation performs no work and returns false.
  virtual bool DoXYBroadcast(Stream* stream,
                             const dnn::BatchDescriptor& dimensions,
                             const DeviceMemory<float>& input_data,
                             int64 replicate_x, int64 replicate_y,
                             DeviceMemory<float>* output_data) {
    return false;
  }
1977 
  // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
  // is, bytes instead of scaled floats) into 'host_dst' if it is available
  // for the underlying DNN implementation. If this quantized output is not
  // available, false is returned, which will place 'stream' into an error
  // state.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'quantized memcpy'
  //    operation should be enqueued onto.
  //  gpu_unquantized_src: the device memory that contains the unquantized data
  //    -- this data should also have a corresponding quantized representation
  //    on the device for this operation to succeed.
  //  mode: Type of quantization of the data to write into host_dst.
  //  host_dst: un-owned host memory region that is mutated in place,
  //    it is clobbered by the values in 'gpu_unquantized_src' when the enqueued
  //    (asynchronous) memcpy operation is performed.
  //  size: size in bytes of the host_dst host memory region.
  virtual bool DoMemcpyD2HQuantized(
      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
      QuantizedActivationMode mode, void* host_dst, int64 size) = 0;
1998 
  // Enqueues an asynchronous memcpy of 'host_src' into the *quantized* input
  // of a layer (that is, bytes instead of scaled floats) if it is supported
  // by the underlying DNN implementation. If this quantized input is not
  // supported, false is returned, which will place 'stream' into an error
  // state.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'quantized memcpy'
  //    operation should be enqueued onto.
  //  host_src: un-owned host memory region that contains the quantized data.
  //  size: size in bytes of the host_src host memory region.
  //  mode: Type of quantization of the data to read from host_src.
  //  gpu_unquantized_dst: the device memory that is clobbered by the values in
  //    'host_src' when the enqueued (asynchronous) memcpy operation is
  //    performed. -- this data should also have a corresponding quantized
  //    representation on the device for this operation to
  //    succeed.
  virtual bool DoMemcpyH2DQuantized(
      Stream* stream, const void* host_src, int64 size,
      QuantizedActivationMode mode,
      DeviceMemory<float>* gpu_unquantized_dst) = 0;
2020 
  // Create an RNN descriptor based on model shapes and configurations.
  // The caller retains the ownership of the descriptor.
  //
  // Arguments:
  //  num_layers: the number of layers for a RNN model.
  //  hidden_size: the size of the hidden state.
  //  input_size: the size of the input state.
  //  batch_size: NOTE(review): not described by the original comment;
  //    presumably the size of a minibatch -- confirm with the implementations.
  //  input_mode: an enum to specify whether a linear transformation is added
  //    after the input state. If input_size is different from hidden_size, this
  //    is required.
  //  direction_mode: an enum to specify whether this model is unidirectional or
  //    bidirectional.
  //  rnn_mode: an enum to specify the type of model to build.
  //  data_type: an enum to specify the data types used in this model.
  //  algorithm_config: NOTE(review): not described by the original comment;
  //    presumably selects the algorithm the RNN should run with -- confirm.
  //  dropout: the dropout threshold between layers. When it is 0., no dropout
  //    is added.
  //  seed: a seed for initializing the dropout layers.
  //  state_allocator: a memory allocator that will be used to store the state
  //    for dropout layer. The user has to maintain the memory until the model
  //    is no longer in use.
  //
  // Returns the descriptor wrapped in a StatusOr. This default implementation
  // builds nothing and always returns an UNIMPLEMENTED status; being virtual,
  // it is the hook a platform overrides to provide RNN support.
  virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
  createRnnDescriptor(int num_layers, int hidden_size, int input_size,
                      int batch_size, dnn::RnnInputMode input_mode,
                      dnn::RnnDirectionMode direction_mode,
                      dnn::RnnMode rnn_mode, dnn::DataType data_type,
                      const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator) {
    return port::Status(port::error::UNIMPLEMENTED,
                        "createRnnDescriptor is unimplemented");
  }
2052 
  // Create a RNN sequence descriptor that specifies either the input or output
  // sequence. The caller retains the ownership of the returned descriptor.
  //
  // Arguments:
  //  max_seq_length: the max length of the sequences.
  //  batch_size: the size of a minibatch.
  //  data_size: the size of the state.
  //  seq_lengths: the lengths of sequences in a batch. Only the overload that
  //    accepts per-sequence lengths takes this argument; the four-argument
  //    form declared directly below does not.
  //  data_type: an enum to specify the type for the underlying data.
  //
  // The default implementation creates nothing and always returns an
  // UNIMPLEMENTED status.
  virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                    int data_size, dnn::DataType data_type) {
    return port::Status(port::error::UNIMPLEMENTED,
                        "createRnnSequenceTensorDescriptor is unimplemented");
  }
2068 
  // Variant of createRnnSequenceTensorDescriptor that additionally receives
  // 'seq_lengths' (the lengths of the sequences in a batch) and 'time_major'.
  // NOTE(review): 'time_major' presumably selects a time-major rather than a
  // batch-major data layout -- confirm against the platform implementations.
  //
  // The default implementation creates nothing and always returns an
  // UNIMPLEMENTED status.
  virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                    int data_size,
                                    const absl::Span<const int>& seq_lengths,
                                    bool time_major, dnn::DataType data_type) {
    return port::Status(port::error::UNIMPLEMENTED,
                        "createRnnSequenceTensorDescriptor is unimplemented");
  }
2077 
  // Create an RNN state descriptor that specifies the input or hidden state.
  // The caller retains the ownership of the returned descriptor.
  //
  // NOTE(review): the arguments are not documented in this header. Presumably
  // 'num_layer' is the number of RNN layers, 'batch_size' the size of a
  // minibatch, 'data_size' the size of the state and 'data_type' the type of
  // the underlying data -- confirm against the platform implementations.
  //
  // The default implementation creates nothing and always returns an
  // UNIMPLEMENTED status.
  virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
  createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
                                 dnn::DataType data_type) {
    return port::Status(port::error::UNIMPLEMENTED,
                        "createRnnStateTensorDescriptor is unimplemented");
  }
2086 
  // Enqueue a forward operation of the RNN model onto the stream.
  // This overload operates on Eigen::half data.
  //
  // Arguments:
  //  stream: pointer to the stream where this operation should be enqueued to.
  //  rnn_desc: a RNN descriptor created by createRnnDescriptor.
  //  input_desc: descriptor for the input sequence.
  //  input_data: the device memory region that contains the input data.
  //  input_h_desc: descriptor for the input "h" state.
  //  input_h_data: the device memory region that contains the input "h" data.
  //  input_c_desc: descriptor for the input "c" state.
  //  input_c_data: the device memory region that contains the input "c" data.
  //    This must be specified for LSTM models.
  //  params: the device memory region that contains the parameters used in this
  //    model.
  //  output_desc: descriptor for the output sequence.
  //  output_data: the memory region that stores the output sequence data.
  //  output_h_desc: descriptor for the output "h" state.
  //  output_h_data: the memory region that stores the output "h" data.
  //  output_c_desc: descriptor for the output "c" state.
  //  output_c_data: the memory region that stores the output "c" data. This
  //    must be specified for LSTM models.
  //  is_training: whether this is used in training or inference. That decides
  //    whether reserve_space data need to be produced.
  //  reserve_space_allocator: if "is_training" is true, a memory allocator
  //    to create memory that holds the produced reserve_space. The caller
  //    retains the data and feeds it to the backward pass.
  //  workspace_allocator: an allocator to create temporary workspace used in
  //    this kernel. The caller is responsible for retaining the memory long
  //    enough for the lifespan of this operation, and recycles afterwards.
  //  output_profile_result: NOTE(review): not described by the original
  //    comment; presumably receives the profile result for this call when it
  //    is not nullptr -- confirm.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                            const dnn::RnnSequenceTensorDescriptor& input_desc,
                            const DeviceMemory<Eigen::half>& input_data,
                            const dnn::RnnStateTensorDescriptor& input_h_desc,
                            const DeviceMemory<Eigen::half>& input_h_data,
                            const dnn::RnnStateTensorDescriptor& input_c_desc,
                            const DeviceMemory<Eigen::half>& input_c_data,
                            const DeviceMemory<Eigen::half>& params,
                            const dnn::RnnSequenceTensorDescriptor& output_desc,
                            DeviceMemory<Eigen::half>* output_data,
                            const dnn::RnnStateTensorDescriptor& output_h_desc,
                            DeviceMemory<Eigen::half>* output_h_data,
                            const dnn::RnnStateTensorDescriptor& output_c_desc,
                            DeviceMemory<Eigen::half>* output_c_data,
                            bool is_training,
                            ScratchAllocator* reserve_space_allocator,
                            ScratchAllocator* workspace_allocator,
                            dnn::ProfileResult* output_profile_result) {
    return false;
  }
2136 
  // DoRnnForward overload whose input, state, parameter and output buffers
  // are DeviceMemory<float>. The argument list otherwise matches the
  // Eigen::half and double overloads.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                            const dnn::RnnSequenceTensorDescriptor& input_desc,
                            const DeviceMemory<float>& input_data,
                            const dnn::RnnStateTensorDescriptor& input_h_desc,
                            const DeviceMemory<float>& input_h_data,
                            const dnn::RnnStateTensorDescriptor& input_c_desc,
                            const DeviceMemory<float>& input_c_data,
                            const DeviceMemory<float>& params,
                            const dnn::RnnSequenceTensorDescriptor& output_desc,
                            DeviceMemory<float>* output_data,
                            const dnn::RnnStateTensorDescriptor& output_h_desc,
                            DeviceMemory<float>* output_h_data,
                            const dnn::RnnStateTensorDescriptor& output_c_desc,
                            DeviceMemory<float>* output_c_data,
                            bool is_training,
                            ScratchAllocator* reserve_space_allocator,
                            ScratchAllocator* workspace_allocator,
                            dnn::ProfileResult* output_profile_result) {
    return false;
  }
2157 
  // DoRnnForward overload whose input, state, parameter and output buffers
  // are DeviceMemory<double>. The argument list otherwise matches the
  // Eigen::half and float overloads.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                            const dnn::RnnSequenceTensorDescriptor& input_desc,
                            const DeviceMemory<double>& input_data,
                            const dnn::RnnStateTensorDescriptor& input_h_desc,
                            const DeviceMemory<double>& input_h_data,
                            const dnn::RnnStateTensorDescriptor& input_c_desc,
                            const DeviceMemory<double>& input_c_data,
                            const DeviceMemory<double>& params,
                            const dnn::RnnSequenceTensorDescriptor& output_desc,
                            DeviceMemory<double>* output_data,
                            const dnn::RnnStateTensorDescriptor& output_h_desc,
                            DeviceMemory<double>* output_h_data,
                            const dnn::RnnStateTensorDescriptor& output_c_desc,
                            DeviceMemory<double>* output_c_data,
                            bool is_training,
                            ScratchAllocator* reserve_space_allocator,
                            ScratchAllocator* workspace_allocator,
                            dnn::ProfileResult* output_profile_result) {
    return false;
  }
  // Enqueue a backward operation of the RNN model onto the stream.
  // This overload operates on Eigen::half data.
  //
  // Arguments:
  //  stream: pointer to the stream where this operation should be enqueued to.
  //  rnn_desc: a RNN descriptor created by createRnnDescriptor.
  //  input_desc: descriptor for the input sequence.
  //  input_data: the device memory region that contains the input data.
  //  input_h_desc: descriptor for the input "h" state.
  //  input_h_data: the device memory region that contains the input "h" data.
  //  input_c_desc: descriptor for the input "c" state.
  //  input_c_data: the device memory region that contains the input "c" data.
  //    This must be specified for LSTM models.
  //  params: the device memory region that contains the parameters used in this
  //    model.
  //  output_desc: descriptor for the output sequence.
  //  output_data: the memory region that stores the output sequence data.
  //  output_h_desc: descriptor for the output "h" state.
  //  output_h_data: the memory region that stores the output "h" data.
  //  output_c_desc: descriptor for the output "c" state.
  //  output_c_data: the memory region that stores the output "c" data. This
  //    must be specified for LSTM models.
  //  output_backprop_data: the device memory region that contains the backprop
  //    to the output sequence.
  //  output_h_backprop_data: the device memory region that contains the
  //    backprop to the output "h" state.
  //  output_c_backprop_data: the device memory region that contains the
  //    backprop to the output "c" state.
  //  input_backprop_data: the device memory region that stores the backprop
  //    to the input sequence.
  //  input_h_backprop_data: the device memory region that stores the backprop
  //    to the input "h" state.
  //  input_c_backprop_data: the device memory region that stores the backprop
  //    to the input "c" state.
  //  params_backprop_data: the device memory region that stores the backprop
  //    to the parameters.
  //  reserve_space_data: the reserve_space data that is produced by the forward
  //    operation. This memory region could be modified by this operation.
  //  workspace_allocator: a memory allocator that creates the temporary
  //    workspace memory used by this operation. The caller is responsible for
  //    keeping the memory alive long enough for this operation, and recycles
  //    afterwards.
  //  output_profile_result: NOTE(review): not described by the original
  //    comment; presumably receives the profile result for this call when it
  //    is not nullptr -- confirm.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnBackward(
      Stream* stream, const dnn::RnnDescriptor& rnn_desc,
      const dnn::RnnSequenceTensorDescriptor& input_desc,
      const DeviceMemory<Eigen::half>& input_data,
      const dnn::RnnStateTensorDescriptor& input_h_desc,
      const DeviceMemory<Eigen::half>& input_h_data,
      const dnn::RnnStateTensorDescriptor& input_c_desc,
      const DeviceMemory<Eigen::half>& input_c_data,
      const DeviceMemory<Eigen::half>& params,
      const dnn::RnnSequenceTensorDescriptor& output_desc,
      const DeviceMemory<Eigen::half>& output_data,
      const dnn::RnnStateTensorDescriptor& output_h_desc,
      const DeviceMemory<Eigen::half>& output_h_data,
      const dnn::RnnStateTensorDescriptor& output_c_desc,
      const DeviceMemory<Eigen::half>& output_c_data,
      const DeviceMemory<Eigen::half>& output_backprop_data,
      const DeviceMemory<Eigen::half>& output_h_backprop_data,
      const DeviceMemory<Eigen::half>& output_c_backprop_data,
      DeviceMemory<Eigen::half>* input_backprop_data,
      DeviceMemory<Eigen::half>* input_h_backprop_data,
      DeviceMemory<Eigen::half>* input_c_backprop_data,
      DeviceMemory<Eigen::half>* params_backprop_data,
      DeviceMemory<uint8>* reserve_space_data,
      ScratchAllocator* workspace_allocator,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2246 
  // DoRnnBackward overload whose data, state, parameter and backprop buffers
  // are DeviceMemory<float>; 'reserve_space_data' stays DeviceMemory<uint8>.
  // The argument list otherwise matches the Eigen::half and double overloads.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnBackward(
      Stream* stream, const dnn::RnnDescriptor& rnn_desc,
      const dnn::RnnSequenceTensorDescriptor& input_desc,
      const DeviceMemory<float>& input_data,
      const dnn::RnnStateTensorDescriptor& input_h_desc,
      const DeviceMemory<float>& input_h_data,
      const dnn::RnnStateTensorDescriptor& input_c_desc,
      const DeviceMemory<float>& input_c_data,
      const DeviceMemory<float>& params,
      const dnn::RnnSequenceTensorDescriptor& output_desc,
      const DeviceMemory<float>& output_data,
      const dnn::RnnStateTensorDescriptor& output_h_desc,
      const DeviceMemory<float>& output_h_data,
      const dnn::RnnStateTensorDescriptor& output_c_desc,
      const DeviceMemory<float>& output_c_data,
      const DeviceMemory<float>& output_backprop_data,
      const DeviceMemory<float>& output_h_backprop_data,
      const DeviceMemory<float>& output_c_backprop_data,
      DeviceMemory<float>* input_backprop_data,
      DeviceMemory<float>* input_h_backprop_data,
      DeviceMemory<float>* input_c_backprop_data,
      DeviceMemory<float>* params_backprop_data,
      DeviceMemory<uint8>* reserve_space_data,
      ScratchAllocator* workspace_allocator,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2274 
  // DoRnnBackward overload whose data, state, parameter and backprop buffers
  // are DeviceMemory<double>; 'reserve_space_data' stays DeviceMemory<uint8>.
  // The argument list otherwise matches the Eigen::half and float overloads.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoRnnBackward(
      Stream* stream, const dnn::RnnDescriptor& rnn_desc,
      const dnn::RnnSequenceTensorDescriptor& input_desc,
      const DeviceMemory<double>& input_data,
      const dnn::RnnStateTensorDescriptor& input_h_desc,
      const DeviceMemory<double>& input_h_data,
      const dnn::RnnStateTensorDescriptor& input_c_desc,
      const DeviceMemory<double>& input_c_data,
      const DeviceMemory<double>& params,
      const dnn::RnnSequenceTensorDescriptor& output_desc,
      const DeviceMemory<double>& output_data,
      const dnn::RnnStateTensorDescriptor& output_h_desc,
      const DeviceMemory<double>& output_h_data,
      const dnn::RnnStateTensorDescriptor& output_c_desc,
      const DeviceMemory<double>& output_c_data,
      const DeviceMemory<double>& output_backprop_data,
      const DeviceMemory<double>& output_h_backprop_data,
      const DeviceMemory<double>& output_c_backprop_data,
      DeviceMemory<double>* input_backprop_data,
      DeviceMemory<double>* input_h_backprop_data,
      DeviceMemory<double>* input_c_backprop_data,
      DeviceMemory<double>* params_backprop_data,
      DeviceMemory<uint8>* reserve_space_data,
      ScratchAllocator* workspace_allocator,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2302 
  // Transforms a tensor into another tensor with a different layout and/or data
  // type.
  //
  // Arguments:
  //  stream: pointer to the stream where this operation should be enqueued to.
  //  input_desc: specifies the shape and the data layout of the input tensor.
  //  input_type: the data type of the input tensor.
  //  input_data: the device memory region that contains the input tensor.
  //  output_desc: specifies the shape and the data layout of the output tensor.
  //  output_type: the data type of the output tensor.
  //  scale: an element-wise scaling factor to apply.
  //  output_data: the device memory region that receives the output tensor.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoTransformTensor(Stream* stream,
                                 const dnn::BatchDescriptor& input_desc,
                                 dnn::DataType input_type,
                                 const DeviceMemoryBase& input_data,
                                 const dnn::BatchDescriptor& output_desc,
                                 dnn::DataType output_type, float scale,
                                 DeviceMemoryBase* output_data) {
    return false;
  }
2324 
  // Enqueues a fused convolution+bias+activation operation onto the stream.
  //
  // Arguments (all borrowed):
  //
  //  stream: borrowed pointer to the stream that the 'fusion' operation should
  //  be enqueued onto.
  //
  //  conv_input_descriptor: dimensions of the convolution input layer.
  //  conv_input_data: device memory which contains the convolution input.
  //
  //  filter_descriptor: dimensions of the convolution filter.
  //  filter_data: device memory which contains the convolution filter weights.
  //
  //  convolution_descriptor: stride of the convolution filter.
  //
  //  bias_descriptor: dimensions of the bias layer.
  //  bias_data: device memory region containing biases to add to the
  //  convolution output.
  //
  //  activation_mode: Type of activation to perform.
  //
  //  output_descriptor: dimensions of the output layer.
  //  output_data: device memory region in which to place the fusion result.
  //
  //  output_profile_result: the output profile result for this call.
  //         The profiling is only enabled when this is not nullptr.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedConvolutionBiasActivation(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<float>& conv_input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<float>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2365 
  // Enqueues a fused batchnorm+activation (inference) operation onto the
  // stream. This overload takes float input and output data.
  //
  // Arguments (all borrowed):
  //
  //  stream: borrowed pointer to the stream that the 'fusion' operation should
  //  be enqueued onto.
  //
  //  x_descriptor: dimensions of the batchnorm input layer.
  //  x_data: device memory which contains the batchnorm input.
  //
  //  scale_offset_mean_variance_descriptor:
  //      dimensions of the scale/offset/mean/variance tensor.
  //  scale_data: device memory which contains the scale input.
  //  offset_data: device memory which contains the offset input.
  //  mean_data: device memory which contains the mean input.
  //  variance_data: device memory which contains the variance input.
  //  epsilon: the epsilon value to use in batchnorm calculation.
  //
  //  activation_mode: Type of activation to perform.
  //
  //  y_data: device memory region in which to place the fusion result.
  //
  //  output_profile_result: the output profile result for this call.
  //         The profiling is only enabled when this is not nullptr.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationInference(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<float>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& mean_data,
      const DeviceMemory<float>& variance_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2404 
  // DoFusedBatchNormActivationInference overload for half-precision data:
  // 'x_data' and 'y_data' are DeviceMemory<Eigen::half>, while the scale,
  // offset, mean and variance buffers remain DeviceMemory<float>.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationInference(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<Eigen::half>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& mean_data,
      const DeviceMemory<float>& variance_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2417 
  // Enqueues a fused batchnorm+activation (training-fwd) operation onto the
  // stream. This overload takes float input and output data.
  //
  // Arguments (all borrowed):
  //
  //  stream: borrowed pointer to the stream that the 'fusion' operation should
  //  be enqueued onto.
  //
  //  x_descriptor: dimensions of the batchnorm input layer.
  //  x_data: device memory which contains the batchnorm input.
  //
  //  scale_offset_mean_variance_descriptor:
  //      dimensions of the scale/offset/mean/variance tensor.
  //  scale_data: device memory which contains the scale input.
  //  offset_data: device memory which contains the offset input.
  //  epsilon: the epsilon value to use in batchnorm calculation.
  //
  //  activation_mode: Type of activation to perform.
  //
  //  y_data: device memory region in which to place the fusion result.
  //  batch_mean_data: device memory in which to place the batch mean output.
  //  batch_var_data: device memory in which to place the batch variance output.
  //  saved_mean_data: device memory in which to save the mean for bwd pass.
  //  saved_var_data: device memory in which to save the variance for bwd pass.
  //
  //  output_profile_result: the output profile result for this call.
  //         The profiling is only enabled when this is not nullptr.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationForward(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<float>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2458 
  // DoFusedBatchNormActivationForward overload for half-precision data:
  // 'x_data' and 'y_data' are DeviceMemory<Eigen::half>, while the scale,
  // offset, batch mean/variance and saved mean/variance buffers remain
  // DeviceMemory<float>.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationForward(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<Eigen::half>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2471 
  // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
  // stream. This overload takes float data throughout.
  //
  // Arguments (all borrowed):
  //
  //  stream: borrowed pointer to the stream that the 'fusion' operation should
  //  be enqueued onto.
  //
  //  y_act_backprop_descriptor: dimensions of the backprop input from the
  //  previous layer.
  //  y_act_backprop_data: device memory which contains the backprop input.
  //
  //  y_act_data: device memory which contains the actv-fwd output data.
  //
  //  activation_mode: actv-fwd type.
  //
  //  x_bn_data: NOTE(review): not described by the original comment;
  //  presumably device memory which contains the batchnorm-fwd output that
  //  was fed to the activation -- confirm.
  //
  //  scale_offset_mean_variance_descriptor:
  //      dimensions of the scale/offset/mean/variance tensor.
  //  scale_data: device memory which contains the scale input.
  //  offset_data: device memory which contains the offset input.
  //  saved_mean_data: device memory which contains the saved mean from fwd
  //  pass.
  //  saved_var_data: device memory which contains the saved variance from
  //  fwd pass.
  //
  //  x_bn_backprop_data: device memory region in which to place the backprop
  //  data from this layer.
  //  scale_backprop_data: device memory in which to place the scale backprop
  //  output.
  //  offset_backprop_data: device memory in which to place the offset backprop
  //  output.
  //
  //  output_profile_result: the output profile result for this call.
  //         The profiling is only enabled when this is not nullptr.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationBackward(
      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<float>& y_act_backprop_data,
      const DeviceMemory<float>& y_act_data,
      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& saved_mean_data,
      const DeviceMemory<float>& saved_var_data,
      DeviceMemory<float>* x_bn_backprop_data,
      DeviceMemory<float>* scale_backprop_data,
      DeviceMemory<float>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2520 
  // DoFusedBatchNormActivationBackward overload for half-precision data:
  // 'y_act_backprop_data', 'y_act_data', 'x_bn_data' and 'x_bn_backprop_data'
  // are DeviceMemory<Eigen::half>, while the scale, offset, saved mean/variance
  // and scale/offset backprop buffers remain DeviceMemory<float>.
  //
  // The default implementation enqueues nothing and returns false.
  virtual bool DoFusedBatchNormActivationBackward(
      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<Eigen::half>& y_act_backprop_data,
      const DeviceMemory<Eigen::half>& y_act_data,
      dnn::ActivationMode activation_mode,
      const DeviceMemory<Eigen::half>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& saved_mean_data,
      const DeviceMemory<float>& saved_var_data,
      DeviceMemory<Eigen::half>* x_bn_backprop_data,
      DeviceMemory<float>* scale_backprop_data,
      DeviceMemory<float>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
2538 
2539  protected:
  // Returns whether status is 'ok', and potentially logs the error.
  // Static helper; it is only declared here and its definition is not part of
  // this header.
  // NOTE(review): 'report_error' presumably decides whether a non-ok status
  // gets logged -- confirm against the definition.
  static bool IsStatusOk(const port::Status& status, bool report_error);
2542 
2543  private:
2544   virtual port::Status DoPrepareForConvolution(
2545       ConvolutionKind kind, DataType element_type, Stream* stream,
2546       const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data,
2547       const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
2548       const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
2549       const ConvolutionDescriptor& convolution_descriptor,
2550       const AlgorithmConfig& algorithm_config,
2551       ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
2552       DeviceMemory<uint8>* scratch_memory) {
2553     *algorithm_desc = {};
2554     *scratch_memory = {};
2555     return port::Status::OK();
2556   }
2557 
2558   SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
2559 };
2560 
2561 }  // namespace dnn
2562 }  // namespace stream_executor
2563 
2564 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
2565