/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Neural Net operation support for StreamExecutor instances.
//
// This is an abstract interface for a platform to optionally support common
// neural net operations; it accommodates implementations such as the cudnn
// library operations.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_

#include <functional>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/stream_executor/data_type.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dnn.pb.h"
#include "tensorflow/stream_executor/lib/array_slice.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace Eigen {
struct half;
}  // namespace Eigen

namespace stream_executor {

class HostBuffer;
class Stream;
class ScratchAllocator;

namespace dnn {

// Specifies an index to use when accessing specific spatial dimensions.
enum class DimIndex : int {
  X = 0,
  Y = 1,
  Z = 2,
};

// Helper functions to make methods more readable.
inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
  return data.rbegin()[static_cast<int64>(dim)];
}

inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
  data.rbegin()[static_cast<int64>(dim)] = value;
}

inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
  return SetDim(absl::MakeSpan(*data), dim, value);
}
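
// Note that GetDim/SetDim index from the minor (innermost) end of the span:
// DimIndex::X addresses the last element, DimIndex::Y the second-to-last, and
// so on. A minimal illustration (hypothetical values, for exposition only):
//
//   std::vector<int64> spatial = {/*z=*/4, /*y=*/5, /*x=*/6};
//   GetDim(spatial, DimIndex::X);      // == 6
//   SetDim(&spatial, DimIndex::Y, 7);  // spatial is now {4, 7, 6}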

// int64 is not the same type as tensorflow::protobuf_int64 in open-source.
// This wrapper function gives an int64 array slice view of a repeated int64
// protobuf field.
//
// T should be a protobuf RepeatedField.
template <typename T>
inline absl::Span<const int64> AsInt64Slice(const T& repeated_field) {
  using data_ty =
      typename std::remove_reference<decltype(*repeated_field.data())>::type;
  static_assert(std::is_integral<data_ty>::value &&
                    std::is_signed<data_ty>::value && sizeof(data_ty) == 8,
                "repeated_field.data() must return a pointer to a signed "
                "64-bit integer type.");
  return absl::Span<const int64>(
      reinterpret_cast<const int64*>(repeated_field.data()),
      repeated_field.size());
}
template <typename T>
inline absl::Span<int64> AsInt64Slice(T* repeated_field) {
  using data_ty =
      typename std::remove_reference<decltype(*repeated_field->data())>::type;
  static_assert(std::is_integral<data_ty>::value &&
                    std::is_signed<data_ty>::value && sizeof(data_ty) == 8,
                "repeated_field->data() must return a pointer to a signed "
                "64-bit integer type.");
  return absl::Span<int64>(
      reinterpret_cast<int64*>(repeated_field->mutable_data()),
      repeated_field->size());
}

// Returns a string representation of the given data layout.
std::string DataLayoutString(DataLayout layout);

// Specifies a quantization for activations in a given BatchDescriptor.
enum class QuantizedActivationMode {
  k8Bit = 1,
  k16Bit = 2,
  k32Bit = 4,
};

// Specifies the types of an RNN model.
enum class RnnMode {
  kRnnRelu = 0,
  kRnnTanh = 1,
  kRnnLstm = 2,
  kRnnGru = 3,
};

// Specifies the input model and whether there is a linear transformation
// between the input state and the first layer hidden state.
enum class RnnInputMode {
  kRnnLinearSkip = 0,
  kRnnSkipInput = 1,
};
// Specifies the number of directions used in an RNN model. When the
// bidirectional mode is used, the input states and output sequence contain
// data for both directions.
enum class RnnDirectionMode {
  kRnnUnidirectional = 0,
  kRnnBidirectional = 1,
};

// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when
// performing depth to space and the read layout when performing space to
// depth. It's specified with most-major dimension first and most-minor
// dimension last. In DepthToSpace, the D*M^2 values are read in and then, for
// DepthHeightWidth, written out to the output patch, by varying first width,
// then height, then depth. In C array format, it looks like
// [depth][height][width]. See DepthToSpace comment for more information.
enum class DepthToSpaceLayout { DepthHeightWidth };
// Specifies the descriptor for an RNN model.
//
// An example use case:
//   * The user first creates a model through createRnnDescriptor.
//   * The user queries the size of the underlying opaque parameter buffer.
//   * The user creates and initializes a parameter buffer of the proper size.
//   * The user runs forward and backward operations using this RNN descriptor.
//   * Once in a while, the user queries the maintainable weight and bias
//       regions from the underlying parameter buffer. They are more likely to
//       be forward compatible and should be used when saving and restoring a
//       model.
//   * The user releases the RNN descriptor when the model is no longer in use.
class RnnDescriptor {
 public:
  struct ParamsRegion {
    int64 offset;
    int64 size;
  };
  typedef std::vector<ParamsRegion> ParamsRegions;
  virtual ~RnnDescriptor() {}
  virtual int64 ParamsSizeInBytes() const { return -1; }
  virtual ParamsRegions ParamsWeightRegions() const { return ParamsRegions(); }
  virtual ParamsRegions ParamsBiasRegions() const { return ParamsRegions(); }
};
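
// A hedged sketch of how the params regions might be consumed once a concrete
// RnnDescriptor has been obtained from a platform implementation (the creation
// call itself is platform-specific and not shown here):
//
//   const dnn::RnnDescriptor& rnn_desc = ...;  // provided by the platform
//   int64 params_bytes = rnn_desc.ParamsSizeInBytes();
//   for (const auto& region : rnn_desc.ParamsWeightRegions()) {
//     // region.offset / region.size locate one weight blob inside the opaque
//     // parameter buffer; copy it out when checkpointing a model.
//   }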

// Specifies the sequence in an RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnSequenceTensorDescriptor {
 public:
  virtual ~RnnSequenceTensorDescriptor() {}
};

// Specifies either the input or the hidden state in an RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnStateTensorDescriptor {
 public:
  virtual ~RnnStateTensorDescriptor() {}
};

// Returns a string representation of the given quantization mode.
std::string QuantizedActivationModeString(QuantizedActivationMode mode);

// Describes the dimensions that a layer consumes/produces.
//
// This is a matrix (height, width), its "depth" (feature_map_count),
// how many of these matrices are present (count),
// and the maximum and minimum values expected in the matrix (value_max,
// value_min).
// If input is quantized, all values greater
// than value_max will be clipped to value_max and all values less than
// value_min will be clipped to value_min.
// When quantized output is dequantized no value will be greater than
// value_max or less than value_min.
//
// Uses the named argument construction form:
//
//  auto input_batch_dimensions =
//      BatchDescriptor().set_count(42).set_feature_map_count(7)...
//
// Details:
//
// For a convolutional layer, a single inference takes a 3-dimensional matrix
// of input and produces a 3-dimensional matrix of output. We call the three
// dimensions height, width and feature_map_count, where for an image, the
// height and width correspond to the Y and X pixel indices, respectively, and
// the feature_map_count corresponds to the RGB dimension of the input data.
// Then the count indicates how many 3D matrices are being presented to be
// processed at once; this corresponds to the neural network concept of
// minibatch size.
//
// For a fully connected layer, it's better to put the nodes of the layer in
// the feature_map_count, and leave the height and width as degenerate (== 1).
// Count indicates how many input vectors (degenerate 3D matrices) are to be
// processed.
//
// If unspecified, value_max and value_min default to 0.0.
// If value_max == value_min the Stream will attempt to derive valid values -
// for example the output of Relu6 activation will always be in the range
// [0.0, 6.0].
//
// If unspecified, layout defaults to kYXDepthBatch.
class BatchDescriptor {
 public:
  // Creates a "blank" batch descriptor, which should be initialized via the
  // named argument helpers.
  BatchDescriptor();
  explicit BatchDescriptor(int ndims);

  // Clones values from 'other' for initialization.
  void CloneFrom(const BatchDescriptor& other);

  std::string ToString() const;
  std::string ToShortString() const;

  // Pre-condition:
  //   value_max_ == 0
  //   value_min_ == 0
  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Accessors.
  int64 count() const { return tensor_.dimensions(0); }
  int64 feature_map_count() const { return tensor_.dimensions(1); }
  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
  int ndims() const { return spatial_size().size(); }
  float value_max() const { return value_max_; }
  float value_min() const { return value_min_; }
  DataLayout layout() const { return tensor_.data_layout(); }
  QuantizedActivationMode quantized_activation_mode() const {
    return quantized_activation_mode_;
  }
  // Full dimensions of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_dims(const DataLayout& layout) const;

  // Full strides of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_strides(const DataLayout& layout) const;

  // Named-argument helpers for avoiding user error during construction.
  BatchDescriptor& set_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  BatchDescriptor& set_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  BatchDescriptor& set_height(int64 value) {
    SetDim(spatial_size(), DimIndex::Y, value);
    return *this;
  }
  BatchDescriptor& set_width(int64 value) {
    SetDim(spatial_size(), DimIndex::X, value);
    return *this;
  }
  BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(spatial_size(), dim, value);
    return *this;
  }
  BatchDescriptor& set_value_max(float value) {
    value_max_ = value;
    return *this;
  }
  BatchDescriptor& set_value_min(float value) {
    value_min_ = value;
    return *this;
  }
  BatchDescriptor& set_layout(DataLayout layout) {
    tensor_.set_data_layout(layout);
    return *this;
  }
  BatchDescriptor& set_quantized_activation_mode(
      QuantizedActivationMode quantized_activation_mode) {
    quantized_activation_mode_ = quantized_activation_mode;
    return *this;
  }

  // Return the number of nodes in a single feature map.
  int64 NodesPerFeatureMap() const;

  // Return the number of nodes across all feature maps. Note that this is not
  // affected by the batch count.
  int64 NodesAcrossFeatureMaps() const;

  // Returns the number of elements (e.g. RGB pixel values) required to hold a
  // given batch descriptor, given a no-padding assumption. Note that this is
  // affected by the batch count.
  int64 ElementCount() const;

  // Return the number of weights required to fully connect a layer with
  // dimensions given by the 'input' descriptor with a layer with dimensions
  // given by the 'output' descriptor.
  static int64 FullyConnectedWeightCount(const BatchDescriptor& input,
                                         const BatchDescriptor& output);

  // Return the number of biases required to fully connect to an output layer
  // with dimensions given the 'output' descriptor.
  static int64 FullyConnectedBiasCount(const BatchDescriptor& output);

  // Return a BatchDescriptor for the output of a depth concatenation
  // with the given input descriptors. The inputs should have the same
  // dimensions, except possibly for feature_map_count(), though this
  // function does not verify that.
  static BatchDescriptor DepthConcatenateOutputDescriptor(
      port::ArraySlice<dnn::BatchDescriptor> inputs);

 private:
  absl::Span<const int64> spatial_size() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

  absl::Span<int64> spatial_size() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
  float value_max_;
  float value_min_;
  QuantizedActivationMode quantized_activation_mode_;
};
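
// A minimal, illustrative use of the named-argument helpers above (the values
// are arbitrary and assume a 2-spatial-dimension, NHWC-style activation):
//
//   dnn::BatchDescriptor input_desc;
//   input_desc.set_count(32)        // minibatch size
//       .set_feature_map_count(64)  // channels
//       .set_height(28)
//       .set_width(28)
//       .set_layout(dnn::DataLayout::kBatchYXDepth);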

// Returns a string representation of the given filter layout.
std::string FilterLayoutString(FilterLayout layout);

// Describes a filter for the convolution. This is the "window" from
// height-by-width patches of each of the feature maps in the input layer to
// the cells within the output feature map.
//
// Uses the named argument construction form:
//
//  FilterDescriptor filter_dimensions;
//  filter_dimensions
//    .set_output_feature_map_count(42)
//    .set_input_feature_map_count(7)
//    ...
//
// Arguments:
// - output_feature_map_count: number of feature maps in the output layer.
// - input_feature_map_count: number of feature maps in the input layer (from
//      which the filter patch is taken).
// - input_filter_height: "height" number of neurons used in the sliding window
//      over the input layer.
// - input_filter_width: "width" number of neurons used in the sliding window
//      over the input layer.
//
// Sometimes names like "filter input height" are referred to by synonymous
// terminology, such as "kernel y size".
//
// If unspecified, layout defaults to kOutputInputYX.
class FilterDescriptor {
 public:
  // By default construction, all dimensions are set to zero, so they should
  // all be populated by the user via the named-argument helpers below. (See
  // class comment for details.)
  FilterDescriptor();
  explicit FilterDescriptor(int ndims);
  ~FilterDescriptor();

  // Named-argument helpers for avoiding user error during construction.
  FilterDescriptor& set_output_feature_map_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  FilterDescriptor& set_input_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_height(int64 value) {
    SetDim(input_filter_dims(), DimIndex::Y, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_width(int64 value) {
    SetDim(input_filter_dims(), DimIndex::X, value);
    return *this;
  }
  FilterDescriptor& set_layout(FilterLayout layout) {
    tensor_.set_filter_layout(layout);
    return *this;
  }
  FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(input_filter_dims(), dim, value);
    return *this;
  }
  int ndims() const { return input_filter_dims().size(); }

  void CloneFrom(const FilterDescriptor& other);

  std::string ToString() const;
  std::string ToShortString() const;
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Returns the number of weights required as parameters for a convolution
  // using this filter descriptor.
  int64 ComputeWeightCount() const;

  // Returns the number of biases required as parameters for a convolution
  // using this filter descriptor.
  int64 bias_count() const { return output_feature_map_count(); }

  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
  int64 input_filter_height() const {
    return GetDim(input_filter_dims(), DimIndex::Y);
  }
  int64 input_filter_width() const {
    return GetDim(input_filter_dims(), DimIndex::X);
  }
  int64 input_filter_dim(DimIndex dim) const {
    return GetDim(input_filter_dims(), dim);
  }

  FilterLayout layout() const { return tensor_.filter_layout(); }

  absl::Span<const int64> input_filter_dims() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

 private:
  absl::Span<int64> input_filter_dims() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
};
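
// A small illustrative counterpart for the filter (values arbitrary; the
// layout is left at its kOutputInputYX default):
//
//   dnn::FilterDescriptor filter_desc;
//   filter_desc.set_output_feature_map_count(128)
//       .set_input_feature_map_count(64)
//       .set_input_filter_height(3)
//       .set_input_filter_width(3);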

// Describes how padding should be aligned when the total number of pad
// elements is odd.
enum class PadAlignment : int64 {
  kDefault = 0,        // default padding for the device.
  kCudnnPadding,       // cuDNN padding - prefer to pad at the start.
  kTensorFlowPadding,  // TensorFlow padding - prefer to pad at the end.
};

// Returns a string representation of the given padding alignment.
std::string PadAlignmentString(PadAlignment alignment);

// Print alignment to str. Needed to use CHECK_EQ between two PadAlignments.
std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);

// Describes a convolution.
//
// Uses the named argument construction form:
//
//  ConvolutionDescriptor convolution_dimensions;
//  convolution_dimensions
//    .set_vertical_filter_stride(2)
//    .set_horizontal_filter_stride(2)
//    ...
//
// Arguments:
// - zero_padding_height: padding of the "y dimension" of the input data. Note
//    that this is different from the height of the filter.
// - zero_padding_width: analogous to the height above, but in the "x
//    dimension".
// - vertical_filter_stride: the convolution slides a 2-dimensional window of
//    filter-height-by-filter-width over the input layer -- the center of that
//    window is moved in the "y dimension" according to this stride value.
// - horizontal_filter_stride: analogous to the vertical stride above, but in
//    the "x dimension".
// - vertical_dilation_rate: there will be (vertical_dilation_rate - 1) skipped
//   cells between each filter element in the "y dimension".
// - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
//   skipped cells between each filter element in the "x dimension".
// - convolution_not_crosscorr: By default (convolution_not_crosscorr == false),
//   we perform cross correlation rather than convolution. With the flag set,
//   we perform convolution. Convolution and cross correlation are related by
//   rotating the filter by 180 degrees (or equivalently flipping all spatial
//   dimensions).
class ConvolutionDescriptor {
 public:
  // By default construction, there is no zero-padding and the filter stride is
  // 1x1 (centering the filter on every cell in the input layer's
  // width-by-height area).
  ConvolutionDescriptor();
  explicit ConvolutionDescriptor(int ndims);
  ~ConvolutionDescriptor();

  std::string ToString() const;
  std::string ToShortString() const;
  ConvolutionDescriptorProto ToProto() const { return proto_; }

  ConvolutionDescriptor& set_zero_padding_height(int64 value) {
    SetDim(padding(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding_width(int64 value) {
    SetDim(padding(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
    SetDim(padding(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
    SetDim(strides(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
    SetDim(dilations(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_group_count(int group_count) {
    proto_.set_group_count(group_count);
    return *this;
  }
  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
                                     : ConvolutionMode::CROSS_CORRELATION);
    return *this;
  }
  ConvolutionDescriptor& set_name(const std::string& name) {
    proto_.set_name(name);
    return *this;
  }
  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
  int64 vertical_filter_stride() const {
    return GetDim(strides(), DimIndex::Y);
  }
  int64 horizontal_filter_stride() const {
    return GetDim(strides(), DimIndex::X);
  }
  int64 vertical_dilation_rate() const {
    return GetDim(dilations(), DimIndex::Y);
  }
  int64 horizontal_dilation_rate() const {
    return GetDim(dilations(), DimIndex::X);
  }

  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
  // TODO(timshen): remove this function. No users of this class set a
  // non-default pad alignment.
  PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
  int group_count() const { return proto_.group_count(); }
  int ndims() const { return padding().size(); }
  bool convolution_not_crosscorr() const {
    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
  }

  absl::Span<const int64> strides() const {
    return AsInt64Slice(proto_.strides());
  }

  absl::Span<const int64> dilations() const {
    return AsInt64Slice(proto_.dilations());
  }

  absl::Span<const int64> padding() const {
    return AsInt64Slice(proto_.paddings());
  }

  std::string name() const { return proto_.name(); }

 private:
  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }

  absl::Span<int64> dilations() {
    return AsInt64Slice(proto_.mutable_dilations());
  }

  absl::Span<int64> padding() {
    return AsInt64Slice(proto_.mutable_paddings());
  }

  ConvolutionDescriptorProto proto_;

  // TODO(leary) cudnn provides these fields, but need to characterize what
  // their effect is -- they may be boolean rather than integral.
  // int64 upscale_input_x;
  // int64 upscale_input_y;
};
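
// An illustrative "SAME"-style configuration for a 3x3 filter with stride 1
// (the values are arbitrary, for exposition only):
//
//   dnn::ConvolutionDescriptor conv_desc;
//   conv_desc.set_zero_padding_height(1)
//       .set_zero_padding_width(1)
//       .set_vertical_filter_stride(1)
//       .set_horizontal_filter_stride(1);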

// A patch of values in the input can be pooled via either a max or an average
// operation.
// Specify int64 so there's no padding in PoolingDescriptor.
enum class PoolingMode : int64 {
  kMaximum,
  kAverage,
};

// Specify the dimension in which to concatenate inputs in space.
// Specify int64 so there's no padding in SpaceConcatenateMode.
enum class SpaceConcatenateMode : int64 {
  XDirection,
  YDirection,
};

// Returns a short name for the pooling mode, e.g. "Avg".
std::string ShortPoolingModeString(PoolingMode mode);

// Describes a pooling operation to be enqueued onto a stream via a platform's
// DnnSupport.
//
// TODO(broune): describe how padding works and what happens if the
// window height/width is not divisible by the vertical/horizontal
// stride.
//
// Arguments:
//  pooling_mode: pooling operator to use on the input patch
//  window_height: height of input window
//  window_width: width of input window
//  vertical_stride: vertical delta for center of the input patch
//  horizontal_stride: horizontal delta for center of the input patch
class PoolingDescriptor {
 public:
  PoolingDescriptor();
  explicit PoolingDescriptor(int ndims);

  PoolingDescriptor& set_pooling_mode(PoolingMode value) {
    mode_ = value;
    return *this;
  }
  PoolingDescriptor& set_window_height(int64 value) {
    SetDim(&window_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_window_width(int64 value) {
    SetDim(&window_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_window(DimIndex dim, int64 value) {
    SetDim(&window_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_padding(int64 value) {
    SetDim(&padding_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_padding(int64 value) {
    SetDim(&padding_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_padding(DimIndex dim, int64 value) {
    SetDim(&padding_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_stride(int64 value) {
    SetDim(&strides_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_stride(int64 value) {
    SetDim(&strides_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_stride(DimIndex dim, int64 value) {
    SetDim(&strides_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_propagate_nans(bool value) {
    propagate_nans_ = value;
    return *this;
  }
  PoolingDescriptor& set_name(const std::string& name) {
    name_ = name;
    return *this;
  }

  int ndims() const { return ndims_; }
  void CloneFrom(const PoolingDescriptor& other);

  std::string ToString() const;
  std::string ToShortString() const;

  PoolingMode mode() const { return mode_; }
  int64 window_height() const { return GetDim(window_, DimIndex::Y); }
  int64 window_width() const { return GetDim(window_, DimIndex::X); }
  int64 window(DimIndex dim) const { return GetDim(window_, dim); }
  int64 vertical_padding() const { return GetDim(padding_, DimIndex::Y); }
  int64 horizontal_padding() const { return GetDim(padding_, DimIndex::X); }
  int64 padding(DimIndex dim) const { return GetDim(padding_, dim); }
  int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); }
  int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); }
  int64 stride(DimIndex dim) const { return GetDim(strides_, dim); }
  absl::Span<const int64> window() const { return window_; }
  absl::Span<const int64> padding() const { return padding_; }
  absl::Span<const int64> strides() const { return strides_; }
  bool propagate_nans() const { return propagate_nans_; }
  std::string name() const { return name_; }

 private:
  PoolingMode mode_;
  int ndims_;
  bool propagate_nans_;
  std::string name_;  // Name as in Tensorflow NodeDef, for debugging purposes.

  // Stored as: ..., y, x.
  std::vector<int64> window_;
  std::vector<int64> padding_;
  std::vector<int64> strides_;
};
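
// An illustrative 2x2 max-pool with stride 2 and no padding (values are
// arbitrary, for exposition only):
//
//   dnn::PoolingDescriptor pool_desc;
//   pool_desc.set_pooling_mode(dnn::PoolingMode::kMaximum)
//       .set_window_height(2)
//       .set_window_width(2)
//       .set_vertical_stride(2)
//       .set_horizontal_stride(2);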

// Collects parameters for DNN algorithms.
class AlgorithmDesc {
 public:
  typedef int64 Index;
  AlgorithmDesc() : AlgorithmDesc(0, false) {}
  AlgorithmDesc(Index a, bool use_tensor_ops) {
    proto_.set_algo_id(a);
    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
                                        : AlgorithmProto::DEFAULT_MATH);
  }
  bool tensor_ops_enabled() const {
    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
  }
  Index algo_id() const { return proto_.algo_id(); }
  bool operator==(const AlgorithmDesc& other) const {
    return algo_id() == other.algo_id() &&
           tensor_ops_enabled() == other.tensor_ops_enabled();
  }
  uint64 hash() const;

  AlgorithmProto ToProto() const { return proto_; }

  std::string ToString() const;

 private:
  AlgorithmProto proto_;
};

// Describes the result from a perf experiment.
//
// Arguments:
//  algorithm: returns the exact algorithm that was used.
//  elapsed_time_in_ms: returns the measured elapsed time in milliseconds.
class ProfileResult {
 public:
  bool is_valid() const {
    return algorithm_.has_value() &&
           elapsed_time_in_ms() != std::numeric_limits<float>::max();
  }

  AlgorithmDesc algorithm() const { return *algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }

  float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
  void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }

  size_t scratch_size() const { return scratch_size_; }
  void set_scratch_size(size_t val) { scratch_size_ = val; }

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
  // The scratch size algorithm_ requires. Currently it's only populated by
  // convolutions.
  size_t scratch_size_ = 0;
};

// Describes the configuration for the algorithms that will be used.
//
// Arguments:
//  algorithm: the primary algorithm that should be used.
//  algorithm_no_scratch: a secondary algorithm that should be used, if the
//    allocation for the scratch memory fails.
//  scratch_size: specify the size of scratch memory in bytes needed for the
//    algorithm used.
//
// On the CUDA platform with the cuDNN library, algorithm and
// algorithm_no_scratch would be used. On the ROCm platform with the MIOpen
// library, algorithm and scratch_size would be used. The major difference
// between the two platforms is whether it's possible to get an algorithm
// without scratch memory. On CUDA + cuDNN it's possible, and
// algorithm_no_scratch can be used to track such information, whereas on
// ROCm + MIOpen there is no guarantee of getting one without scratch memory,
// and the scratch_size field is used to track it.
class AlgorithmConfig {
 public:
  AlgorithmConfig() {}
  explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
  AlgorithmConfig(AlgorithmDesc algorithm, size_t scratch_size)
      : algorithm_(algorithm), scratch_size_(scratch_size) {}
  AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
      : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
  absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
  absl::optional<AlgorithmDesc> algorithm_no_scratch() const {
    return algorithm_no_scratch_;
  }
  void set_algorithm_no_scratch(AlgorithmDesc val) {
    algorithm_no_scratch_ = val;
  }
  absl::optional<size_t> scratch_size() const { return scratch_size_; }
  void set_scratch_size(size_t val) { scratch_size_ = val; }
  bool operator==(const AlgorithmConfig& other) const {
    return this->algorithm_ == other.algorithm_ &&
           this->algorithm_no_scratch_ == other.algorithm_no_scratch_ &&
           this->scratch_size_ == other.scratch_size_;
  }
  bool operator!=(const AlgorithmConfig& other) const {
    return !(*this == other);
  }
  std::string ToString() const;

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  absl::optional<AlgorithmDesc> algorithm_no_scratch_;
  absl::optional<size_t> scratch_size_;
};
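
// Illustrative only: a config carrying a primary algorithm plus a fallback
// that needs no scratch memory, as on the CUDA/cuDNN path described above
// (the algorithm indices below are arbitrary):
//
//   dnn::AlgorithmDesc primary(/*a=*/1, /*use_tensor_ops=*/true);
//   dnn::AlgorithmDesc no_scratch(/*a=*/0, /*use_tensor_ops=*/false);
//   dnn::AlgorithmConfig config(primary, no_scratch);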

// Describes a local response normalization (LRN). LRN is used e.g. in
// dist_belief.
//
// Let V be the vector of feature maps at some (batch, y, x)
// coordinate. LRN applies independently to each vector V in the
// input, across all coordinates (batch, y, x), by mapping each V to
// another vector U of the same size using the formula
//
//   U_i = V_i / ((bias + alpha * (sum_j V_j^2)) ^ beta)
//
// where the sum is taken over j in the closed range [i - range, i + range].
//
// When calculating U_i the j in the sum can extend beyond the bounds
// of V. If wrap_around is true, then V_j = V_{j mod F} where F is the
// size of V, which is the number of feature maps. If wrap_around is
// false, then V_j = 0 for j outside [0, F-1].
//
// If segment_size <= F, where F is the number of feature_maps, then
// segment_size has no effect. Otherwise, each consecutive segment of
// segment_size entries in V is normalized separately.
//
// Not all StreamExecutors allow wrap_around == true or segment_size
// != 64. Some do not implement normalization at all.
class NormalizeDescriptor {
 public:
  NormalizeDescriptor();

  NormalizeDescriptor& set_bias(float bias) {
    bias_ = bias;
    return *this;
  }

  NormalizeDescriptor& set_range(int32 range) {
    range_ = range;
    return *this;
  }

  NormalizeDescriptor& set_alpha(float alpha) {
    alpha_ = alpha;
    return *this;
  }

  NormalizeDescriptor& set_beta(float beta) {
    beta_ = beta;
    return *this;
  }

  NormalizeDescriptor& set_wrap_around(bool wrap_around) {
    wrap_around_ = wrap_around;
    return *this;
  }

  NormalizeDescriptor& set_segment_size(int32 segment_size) {
    segment_size_ = segment_size;
    return *this;
  }

  void CloneFrom(const NormalizeDescriptor& other);

  std::string ToString() const;
  std::string ToShortString() const;

  float bias() const { return bias_; }
  int32 range() const { return range_; }
  float alpha() const { return alpha_; }
  float beta() const { return beta_; }
  bool wrap_around() const { return wrap_around_; }
  int32 segment_size() const { return segment_size_; }

 private:
  float bias_;
  int32 range_;
  float alpha_;
  float beta_;
  bool wrap_around_;
  int32 segment_size_;
};
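
// Illustrative parameter choices matching the formula above (the numbers are
// arbitrary, not a recommendation):
//
//   dnn::NormalizeDescriptor lrn_desc;
//   lrn_desc.set_bias(1.0f).set_range(2).set_alpha(1e-4f).set_beta(0.75f);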

// Returns a string representation of the given activation mode.
std::string ActivationModeString(ActivationMode mode);

// Describes the operation that DoElementwiseOperation should perform on its
// inputs.
enum class ElementwiseOperation { kAdd, kMultiply };

std::string ElementwiseOperationString(ElementwiseOperation op);

// A simple class representing the version of the backing library, to
// work around the "too perfect forwarding" issue in gcc6+ compilers.
// See PR#16309 and issue #18402 for links discussing the issue.
class VersionInfo {
 public:
  VersionInfo(int major = 0, int minor = 0, int patch = 0)
      : major_(major), minor_(minor), patch_(patch) {}
  int major_version() const { return major_; }
  int minor_version() const { return minor_; }
  int patch() const { return patch_; }

 private:
  int major_;
  int minor_;
  int patch_;
};

// Suite of operations typically used for implementing Deep/Convolutional
// Neural Nets. Note: A false return value of an operation indicates the
// implementation is not available.
//
// TODO(b/118763918): this class (or rather dispatch table) has several
// problems:
// * Some overloads are missing. Ideally we want to have template virtual
//   functions where the template arguments form a closed set. However, we
//   don't get that from the language.
// * The API is a union of cuDNN and another private backend. Only 10% of the
//   functions are actually implemented by both backends, the rest are
//   backend-specific. The massive interface creates extra mental burden.
// * Poor error handling: the API should return Status objects.
//
// PrepareForConvolution is an example for how new APIs should be written.
class DnnSupport {
 public:
  DnnSupport() {}
  virtual ~DnnSupport() {}

  virtual port::Status Init() = 0;

  // Gets the version of the backing library, as a VersionInfo object.
  virtual port::StatusOr<VersionInfo> GetVersion() {
    return port::UnimplementedError(
        "DnnSupport::GetVersion not implemented on this platform.");
  }

  // Performs a single-precision forward batch normalization operation onto
  // the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    operation should be enqueued onto.
  //  x: input data.
  //  scale: scaling parameters.
  //  offset: offset parameters.
  //  estimated_mean: population mean estimated during training.
  //    Used for inference only; empty for training.
  //  estimated_variance: population variance estimated during training,
  //    used for inference only; empty for training.
  //  side_input: optional input that is element-wise added to the output of
  //    batch normalization.
  //  x_desc: dimensions of the input data, which is the same as the dimensions
  //    of the output and side input.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  activation_mode: activation applied to the result of batch normalization
  //    (or after adding optional side input)
  //  y: output data.
  //  batch_mean: batch mean, to be used to compute the running mean.
  //  batch_variance: batch variance, to be used to compute
  //    the running variance.
  //  reserve_space_1: saved mean, to be reused in the backward gradient
  //    computation.
  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance)), to be reused
  //    in the backward gradient computation.
  //  is_training: Set to true for training, false for inference.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<float>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const DeviceMemory<float>& side_input, const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      const double exponential_average_factor,
      dnn::ActivationMode activation_mode, DeviceMemory<float>* y,
      DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var,
      DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      ScratchAllocator* reserve_space_allocator,
      ScratchAllocator* workspace_allocator) {
    return false;
  }

  // Performs a half-precision forward batch normalization operation onto the
  // stream. See DoBatchNormalizationForward above for argument details.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<Eigen::half>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const DeviceMemory<float>& side_input, const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      const double exponential_average_factor,
      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y,
      DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var,
      DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      ScratchAllocator* reserve_space_allocator,
      ScratchAllocator* workspace_allocator) {
    return false;
  }

  // Performs a single-precision backward batch normalization gradient
  // computation operation onto the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    gradient computation operation should be enqueued onto.
  //  y_backprop: gradient with regard to output y.
  //  x: input data.
  //  scale: scaling parameters.
  //  inv_var: 1/sqrt(epsilon + variance) of x.
  //  x_desc: dimensions of the input data, which is the same as the dimensions
  //    of the output.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  x_backprop: gradient with respect to input x.
  //  scale_backprop: gradient with respect to scale.
  //  offset_backprop: gradient with respect to offset.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<float>& y_backprop,
      const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop,
      DeviceMemory<uint8>* reserve_space_data,
      ScratchAllocator* workspace_allocator) {
    return false;
  }

  // Performs a half-precision backward batch normalization gradient
  // computation operation onto the stream. See DoBatchNormalizationBackward
  // above for argument details.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
      const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* x_backprop,
      DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop,
      DeviceMemory<uint8>* reserve_space_data,
      ScratchAllocator* workspace_allocator) {
    return false;
  }
1092 
1093   // Enqueues a fused convolution operation onto the stream.
1094   // We provide several variants with different types for inputs, biases and
1095   // scaling parameters.
1096   //
1097   // Arguments (all borrowed):
1098   //  stream: borrowed pointer to the stream that the 'convolve' operation
1099   //    should be enqueued onto.
1100   //  conv_input_descriptor: dimensions of the convolution input layer.
1101   //  conv_input_data: un-owned device memory region which contains the
1102   //    convolution input.
1103   //  conv_input_scale: a floating point scale to multiply with each element
1104   //    of conv_input_data.
1105   //  filter_descriptor: dimensions of the convolution filter.
1106   //  filter_data: un-owned device memory region which contains the
1107   //    convolution filter weights.
1108   //  convolution_descriptor: stride of the convolution filter.
1109   //  biases: un-owned device memory region containing biases to add to the
1110   //    input.
1111   //  activation_mode: Type of activation to perform.
1112   //  side_input_data: un-owned device memory region which contains optional
1113   //    side input data. If 'side_input_scale' is non-zero, then this must
1114   //    point to data in the tensor shape specified by output_shape.
1115   //    It will be scaled by 'side_input_scale' and added to the convolution
1116   //    result and bias prior to applying the activation function.
1117   //  side_input_scale: a floating point scale to multiply with each element
1118   //    of side_input_data.
1119   //  output_descriptor: dimensions of the output layer.
1120   //  output_data: un-owned device memory region in which to place the
1121   //    convolution result.
1122   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
1123   //    space in order to speed up the convolution operation.
1124   //  algorithm_config: specifies which algorithm should be used for the
1125   //    operation.
1126   //  output_profile_result: the output profile result for this call. The
1127   //    profiling is only enabled when this is not nullptr.
1128   //
1129   // conv_input_descriptor, filter_descriptor, convolution_descriptor and
1130   // output_descriptor together specify exactly how the convolution is aligned
1131   // with the input data:
1132   //
1133   // * (input dimensions - filter size + 1) / filter stride == output dimensions
1134   //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
1135   // * input dimensions / filter stride == output dimensions
1136   //   corresponds to dist_belief padding = SAME, i.e. input and output are the
1137   //   same size - this requires padding the input.
1138   // * (input dimensions + filter size - 1) / filter stride == output dimensions
1139   //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
1140   //   that if the inverse of the filter is applied to the output in VALID mode
1141   //   the result is the same size as the input - this requires even more
1142   //   padding of the input.
DoFusedConvolve(Stream * stream,const dnn::BatchDescriptor & conv_input_descriptor,const DeviceMemory<double> & conv_input_data,double conv_input_scale,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<double> & filter_data,const dnn::ConvolutionDescriptor & convolution_descriptor,const DeviceMemory<double> & side_input_data,double side_input_scale,const dnn::BatchDescriptor & bias_descriptor,const DeviceMemory<double> & biases,dnn::ActivationMode activation_mode,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<double> * output_data,ScratchAllocator * scratch_allocator,const dnn::AlgorithmConfig & algorithm_config,dnn::ProfileResult * output_profile_result)1143   virtual port::Status DoFusedConvolve(
1144       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1145       const DeviceMemory<double>& conv_input_data, double conv_input_scale,
1146       const dnn::FilterDescriptor& filter_descriptor,
1147       const DeviceMemory<double>& filter_data,
1148       const dnn::ConvolutionDescriptor& convolution_descriptor,
1149       const DeviceMemory<double>& side_input_data, double side_input_scale,
1150       const dnn::BatchDescriptor& bias_descriptor,
1151       const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
1152       const dnn::BatchDescriptor& output_descriptor,
1153       DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
1154       const dnn::AlgorithmConfig& algorithm_config,
1155       dnn::ProfileResult* output_profile_result) {
1156     return port::UnimplementedError(
1157         "DnnSupport::DoFusedConvolve not implemented on this platform.");
1158   }
1159 
1160   // This is the float version of DoFusedConvolve.
DoFusedConvolve(Stream * stream,const dnn::BatchDescriptor & conv_input_descriptor,const DeviceMemory<float> & conv_input_data,float conv_input_scale,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<float> & filter_data,const dnn::ConvolutionDescriptor & convolution_descriptor,const DeviceMemory<float> & side_input_data,float side_input_scale,const dnn::BatchDescriptor & bias_descriptor,const DeviceMemory<float> & biases,dnn::ActivationMode activation_mode,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data,ScratchAllocator * scratch_allocator,const dnn::AlgorithmConfig & algorithm_config,dnn::ProfileResult * output_profile_result)1161   virtual port::Status DoFusedConvolve(
1162       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1163       const DeviceMemory<float>& conv_input_data, float conv_input_scale,
1164       const dnn::FilterDescriptor& filter_descriptor,
1165       const DeviceMemory<float>& filter_data,
1166       const dnn::ConvolutionDescriptor& convolution_descriptor,
1167       const DeviceMemory<float>& side_input_data, float side_input_scale,
1168       const dnn::BatchDescriptor& bias_descriptor,
1169       const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
1170       const dnn::BatchDescriptor& output_descriptor,
1171       DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
1172       const dnn::AlgorithmConfig& algorithm_config,
1173       dnn::ProfileResult* output_profile_result) {
1174     return port::UnimplementedError(
1175         "DnnSupport::DoFusedConvolve not implemented on this platform.");
1176   }
1177 
1178   // This is the Eigen::half version of DoFusedConvolve.
1179   // The scaling parameters are still floats.
1180   virtual port::Status DoFusedConvolve(
1181       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1182       const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
1183       const dnn::FilterDescriptor& filter_descriptor,
1184       const DeviceMemory<Eigen::half>& filter_data,
1185       const dnn::ConvolutionDescriptor& convolution_descriptor,
1186       const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
1187       const dnn::BatchDescriptor& bias_descriptor,
1188       const DeviceMemory<Eigen::half>& biases,
1189       dnn::ActivationMode activation_mode,
1190       const dnn::BatchDescriptor& output_descriptor,
1191       DeviceMemory<Eigen::half>* output_data,
1192       ScratchAllocator* scratch_allocator,
1193       const dnn::AlgorithmConfig& algorithm_config,
1194       dnn::ProfileResult* output_profile_result) {
1195     return port::UnimplementedError(
1196         "DnnSupport::DoFusedConvolve not implemented on this platform.");
1197   }
1198 
1199   // This is the int8 version of DoFusedConvolve.
1200   // The bias input and scaling parameters are floats.
1201   virtual port::Status DoFusedConvolve(
1202       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1203       const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
1204       const dnn::FilterDescriptor& filter_descriptor,
1205       const DeviceMemory<int8>& filter_data,
1206       const dnn::ConvolutionDescriptor& convolution_descriptor,
1207       const DeviceMemory<int8>& side_input_data, float side_input_scale,
1208       const dnn::BatchDescriptor& bias_descriptor,
1209       const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
1210       const dnn::BatchDescriptor& output_descriptor,
1211       DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
1212       const dnn::AlgorithmConfig& algorithm_config,
1213       dnn::ProfileResult* output_profile_result) {
1214     return port::UnimplementedError(
1215         "DnnSupport::DoFusedConvolve not implemented on this platform.");
1216   }
1217 
1218   // This is the int8 version of DoFusedConvolve.
1219   // The output, bias input and scaling parameters are floats.
1220   virtual port::Status DoFusedConvolve(
1221       Stream* /*stream*/, const dnn::BatchDescriptor& /*conv_input_descriptor*/,
1222       const DeviceMemory<int8>& /*conv_input_data*/, float /*conv_input_scale*/,
1223       const dnn::FilterDescriptor& /*filter_descriptor*/,
1224       const DeviceMemory<int8>& /*filter_data*/,
1225       const dnn::ConvolutionDescriptor& /*convolution_descriptor*/,
1226       const DeviceMemory<float>& /*side_input_data*/,
1227       float /*side_input_scale*/,
1228       const dnn::BatchDescriptor& /*bias_descriptor*/,
1229       const DeviceMemory<float>& /*biases*/,
1230       dnn::ActivationMode /*activation_mode*/,
1231       const dnn::BatchDescriptor& /*output_descriptor*/,
1232       DeviceMemory<float>* /*output_data*/,
1233       ScratchAllocator* /*scratch_allocator*/,
1234       const dnn::AlgorithmConfig& /*algorithm_config*/,
1235       dnn::ProfileResult* /*output_profile_result*/) {
1236     return port::UnimplementedError(
1237         "DnnSupport::DoFusedConvolve not implemented on this platform.");
1238   }
1239 
1240   template <typename ElementType, typename OutputType>
1241   port::Status PrepareForConvolution(
1242       ConvolutionKind kind, Stream* stream,
1243       const BatchDescriptor& batch_descriptor,
1244       DeviceMemory<ElementType> input_data,
1245       const FilterDescriptor& filter_descriptor,
1246       DeviceMemory<ElementType> filter_data,
1247       const BatchDescriptor& output_descriptor,
1248       DeviceMemory<OutputType> output_data,
1249       const ConvolutionDescriptor& convolution_descriptor,
1250       const AlgorithmConfig& algorithm_config,
1251       ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
1252       DeviceMemory<uint8>* scratch_memory) {
1253     return DoPrepareForConvolution(
1254         kind, ToDataType<ElementType>::value, stream, batch_descriptor,
1255         input_data, filter_descriptor, filter_data, output_descriptor,
1256         output_data, convolution_descriptor, algorithm_config,
1257         scratch_allocator, algorithm_desc, scratch_memory);
1258   }
1259 
1260   // Enqueues a convolution operation onto the stream.
1261   //
1262   // Arguments (all borrowed):
1263   //  stream: borrowed pointer to the stream that the 'convolve' operation
1264   //    should be enqueued onto.
1265   //  input_descriptor: dimensions of the input layer.
1266   //  input_data: un-owned device memory region which contains the
1267   //    convolution input.
1268   //  filter_descriptor: dimensions of the convolution filter.
1269   //  convolution_descriptor: stride of the convolution filter.
1270   //  output_descriptor: dimensions of the output layer.
1271   //  output_data: un-owned device memory region in which to place the
1272   //    convolution result.
1273   //  algorithm_desc: specifies which algorithm should be used for the
1274   //    operation.
1275   //  scratch: un-owned device memory for scratch space in order to speed up
1276   //    the convolution operation.
1277   //  output_profile_result: the output profile result for this call. The
1278   //    profiling is only enabled when this is not nullptr.
1279   //
1280   // input_descriptor, filter_descriptor, convolution_descriptor and
1281   // output_descriptor together specify exactly how the convolution is aligned
1282   // with the input data:
1283   //
1284   // * (input dimensions - filter size + 1) / filter stride == output dimensions
1285   //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
1286   // * input dimensions / filter stride == output dimensions
1287   //   corresponds to dist_belief padding = SAME, i.e. input and output are the
1288   //   same size - this requires padding the input.
1289   // * (input dimensions + filter size - 1) / filter stride == output dimensions
1290   //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
1291   //   that if the inverse of the filter is applied to the output in VALID mode
1292   //   the result is the same size as the input - this requires even more
1293   //   padding of the input.
1294   virtual port::Status DoConvolve(
1295       ConvolutionKind kind, DataType element_type, DataType output_type,
1296       Stream* stream, const BatchDescriptor& input_descriptor,
1297       DeviceMemoryBase input_data, const FilterDescriptor& filter_descriptor,
1298       DeviceMemoryBase filter_data, const BatchDescriptor& output_descriptor,
1299       DeviceMemoryBase output_data,
1300       const ConvolutionDescriptor& convolution_descriptor,
1301       AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
1302       ProfileResult* output_profile_result) = 0;
1303 
1304   template <typename ElementType, typename OutputType>
1305   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1306                   const DeviceMemory<ElementType>& input_data,
1307                   const dnn::FilterDescriptor& filter_descriptor,
1308                   const DeviceMemory<ElementType>& filter_data,
1309                   const dnn::ConvolutionDescriptor& convolution_descriptor,
1310                   const dnn::BatchDescriptor& output_descriptor,
1311                   DeviceMemory<OutputType>* output_data,
1312                   const dnn::AlgorithmDesc& algorithm_desc,
1313                   DeviceMemory<uint8>* scratch_memory,
1314                   ProfileResult* output_profile_result) {
1315     return IsStatusOk(
1316         DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
1317                    ToDataType<OutputType>::value, stream, input_descriptor,
1318                    input_data, filter_descriptor, filter_data,
1319                    output_descriptor, *output_data, convolution_descriptor,
1320                    algorithm_desc, *scratch_memory, output_profile_result),
1321         !output_profile_result);
1322   }
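
  // Illustrative only: a minimal sketch of how a caller might drive the typed
  // DoConvolve wrapper above. The descriptor setup is elided, and `dnn`,
  // `stream`, and the device buffers are assumed to exist; this is not a
  // prescribed usage pattern.
  //
  //   DnnSupport* dnn = ...;                  // platform DNN implementation
  //   DeviceMemory<float> input = ..., filter = ..., output = ...;
  //   DeviceMemory<uint8> scratch = ...;      // obtained via a ScratchAllocator
  //   dnn::AlgorithmDesc algo;                // e.g. chosen by autotuning
  //   bool ok = dnn->DoConvolve(stream, input_desc, input, filter_desc, filter,
  //                             conv_desc, output_desc, &output, algo, &scratch,
  //                             /*output_profile_result=*/nullptr);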
1323 
1324   // Return a list of algorithms supported by the forward convolution pass.
1325   // cc_major and cc_minor are the compute capabilities of the device.
1326   virtual bool GetConvolveAlgorithms(
1327       bool with_winograd_nonfused, int cc_major, int cc_minor,
1328       std::vector<AlgorithmDesc>* out_algorithms);
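
  // A rough autotuning sketch (illustrative only; the ProfileResult accessors
  // and the surrounding setup are assumptions, not requirements of this
  // interface): enumerate the candidate algorithms, run the convolution once
  // per candidate with profiling enabled, and keep the fastest valid result.
  //
  //   std::vector<dnn::AlgorithmDesc> algos;
  //   dnn->GetConvolveAlgorithms(/*with_winograd_nonfused=*/true, cc_major,
  //                              cc_minor, &algos);
  //   dnn::AlgorithmDesc best;
  //   float best_ms = std::numeric_limits<float>::infinity();
  //   for (const dnn::AlgorithmDesc& algo : algos) {
  //     dnn::ProfileResult profile;
  //     dnn->DoConvolve(stream, input_desc, input, filter_desc, filter,
  //                     conv_desc, output_desc, &output, algo, &scratch,
  //                     &profile);
  //     if (profile.is_valid() && profile.elapsed_time_in_ms() < best_ms) {
  //       best_ms = profile.elapsed_time_in_ms();
  //       best = algo;
  //     }
  //   }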
1329 
1330   virtual bool GetMIOpenConvolveAlgorithms(
1331       dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
1332       const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
1333       const dnn::FilterDescriptor& filter_descriptor,
1334       DeviceMemoryBase filter_data,
1335       const dnn::BatchDescriptor& output_descriptor,
1336       DeviceMemoryBase output_data,
1337       const dnn::ConvolutionDescriptor& convolution_descriptor,
1338       ScratchAllocator* scratch_allocator,
1339       std::vector<ProfileResult>* out_algorithms);
1340 
1341   // Returns a list of supported rnn algorithms.
1342   virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
1343 
1344   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
1345   // coefficient_scales specifies the scaling of each column of coefficients:
1346   // original float coefficient[row * num_columns + column] =
1347   //     quantized coefficient[row * num_columns + column] *
1348   //     coefficient_scales[column].
1349   virtual bool DoConvolveQuantized(
1350       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1351       const DeviceMemory<float>& input_data,
1352       const dnn::FilterDescriptor& filter_descriptor,
1353       const DeviceMemory<int8>& filter_coefficients,
1354       const DeviceMemory<float>& coefficient_scales,
1355       const dnn::ConvolutionDescriptor& convolution_descriptor,
1356       const dnn::BatchDescriptor& output_descriptor,
1357       DeviceMemory<float>* output_data) = 0;
1358 
1359   // Same as DoConvolveQuantized above, but int16 filter coefficients.
1360   virtual bool DoConvolveQuantized(
1361       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1362       const DeviceMemory<float>& input_data,
1363       const dnn::FilterDescriptor& filter_descriptor,
1364       const DeviceMemory<int16>& filter_coefficients,
1365       const DeviceMemory<float>& coefficient_scales,
1366       const dnn::ConvolutionDescriptor& convolution_descriptor,
1367       const dnn::BatchDescriptor& output_descriptor,
1368       DeviceMemory<float>* output_data) = 0;
1369 
1370   // Variation of the above with the weight matrix split into two matrices.
1371   // first_weights: Coefficients of the first matrix.
1372   // second_weights: Coefficients of the second matrix.
1373   // depth_multiplier: specifies the columns of the first matrix and rows
1374   // of the second one - first_weights columns = depth_multiplier,
1375   // second_weights rows = depth_multiplier *
1376   //                       filter_descriptor.input_feature_map_count().
1377   // see go/separable for documentation on separable convolutions.
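  //
  // Shape sketch (illustrative numbers only): with depth_multiplier = 4 and
  // filter_descriptor.input_feature_map_count() = 8, first_weights has 4
  // columns and second_weights has 4 * 8 = 32 rows, per the description above.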
1378   virtual bool DoSeparableConvolve(
1379       Stream* stream, const BatchDescriptor& input_descriptor,
1380       const DeviceMemory<float>& input_data,
1381       const FilterDescriptor& filter_descriptor, int depth_multiplier,
1382       const DeviceMemory<float>& first_weights,
1383       const DeviceMemory<float>& second_weights,
1384       const ConvolutionDescriptor& convolution_descriptor,
1385       const BatchDescriptor& output_descriptor,
1386       DeviceMemory<float>* output_data) = 0;
1387 
1388   // Enqueues a single-precision backward convolution (for data) operation onto
1389   // the stream.
1390   //
1391   // Arguments:
1392   //  stream: borrowed pointer to the stream that the 'convolve' operation
1393   //    should be enqueued onto.
1394   //  filter_descriptor: dimensions of the convolution filter.
1395   //  filter_data: coefficients for the convolution filter.
1396   //  output_descriptor: dimensions of the output gradients, which is the same
1397   //    as the dimensions of the output.
1398   //  backward_output_data: un-owned device memory region which contains the
1399   //    backprop of the output.
1400   //  convolution_descriptor: stride of the convolution filter.
1401   //  input_descriptor: dimensions of the input layer.
1402   //  backward_input_data: un-owned device memory region in which to place the
1403   //    backprop of the input.
1404   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
1405   //    space in order to speed up the convolution operation.
1406   template <typename ElementType>
1407   bool DoConvolveBackwardData(
1408       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
1409       const DeviceMemory<ElementType>& filter_data,
1410       const dnn::BatchDescriptor& output_descriptor,
1411       const DeviceMemory<ElementType>& backward_output_data,
1412       const dnn::ConvolutionDescriptor& convolution_descriptor,
1413       const dnn::BatchDescriptor& input_descriptor,
1414       DeviceMemory<ElementType>* backward_input_data,
1415       const dnn::AlgorithmDesc& algorithm_desc,
1416       DeviceMemory<uint8>* scratch_memory,
1417       ProfileResult* output_profile_result) {
1418     return IsStatusOk(
1419         DoConvolve(
1420             ConvolutionKind::BACKWARD_DATA, ToDataType<ElementType>::value,
1421             ToDataType<ElementType>::value, stream, input_descriptor,
1422             *backward_input_data, filter_descriptor, filter_data,
1423             output_descriptor, backward_output_data, convolution_descriptor,
1424             algorithm_desc, *scratch_memory, output_profile_result),
1425         !output_profile_result);
1426   }
1427 
1428   // Return a list of algorithms supported by the backward convolution pass for
1429   // data.
1430   virtual bool GetConvolveBackwardDataAlgorithms(
1431       bool with_winograd_nonfused, int cc_major, int cc_minor,
1432       std::vector<AlgorithmDesc>* out_algorithms);
1433 
1434   // Enqueues a single-precision backward convolution (for filter) operation
1435   // onto the stream.
1436   //
1437   // Arguments:
1438   //  stream: borrowed pointer to the stream that the 'convolve' operation
1439   //    should be enqueued onto.
1440   //  input_descriptor: dimensions of the input layer.
1441   //  input_data: un-owned device memory region which contains the
1442   //    convolution input.
1443   //  output_descriptor: dimensions of the output gradients, which is the same
1444   //    as the dimensions of the output.
1445   //  backward_output_data: un-owned device memory region which contains the
1446   //    backprop of the output.
1447   //  convolution_descriptor: stride of the convolution filter.
1448   //  filter_descriptor: dimensions of the convolution filter.
1449   //  backward_filter_data: un-owned device memory region in which to place the
1450   //    backprop of the filter.
1451   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
1452   //    space in order to speed up the convolution operation.
1453   template <typename ElementType>
1454   bool DoConvolveBackwardFilter(
1455       Stream* stream, const BatchDescriptor& input_descriptor,
1456       const DeviceMemory<ElementType>& input_data,
1457       const BatchDescriptor& output_descriptor,
1458       const DeviceMemory<ElementType>& backward_output_data,
1459       const ConvolutionDescriptor& convolution_descriptor,
1460       const FilterDescriptor& filter_descriptor,
1461       DeviceMemory<ElementType>* backward_filter_data,
1462       const dnn::AlgorithmDesc& algorithm_desc,
1463       DeviceMemory<uint8>* scratch_memory,
1464       ProfileResult* output_profile_result) {
1465     return IsStatusOk(
1466         DoConvolve(
1467             ConvolutionKind::BACKWARD_FILTER, ToDataType<ElementType>::value,
1468             ToDataType<ElementType>::value, stream, input_descriptor,
1469             input_data, filter_descriptor, *backward_filter_data,
1470             output_descriptor, backward_output_data, convolution_descriptor,
1471             algorithm_desc, *scratch_memory, output_profile_result),
1472         !output_profile_result);
1473   }
1474 
1475   // Return a list of algorithms supported by the backward convolution pass for
1476   // filters.
1477   virtual bool GetConvolveBackwardFilterAlgorithms(
1478       bool with_winograd_nonfused, int cc_major, int cc_minor,
1479       std::vector<AlgorithmDesc>* out_algorithms);
1480 
1481   // Enqueues a single-precision backward convolution (for bias) operation onto
1482   // the stream.
1483   //
1484   // Arguments:
1485   //  stream: borrowed pointer to the stream that the 'convolve' operation
1486   //    should be enqueued onto.
1487   //  input_descriptor: dimensions of the input layer.
1488   //  input_data: un-owned device memory region which contains the
1489   //    convolution input.
1490   //  bias_descriptor: dimensions of the bias tensor. Should be the same as the
1491   //    input dimensions, but with the spatial dimensions set to 1.
1492   //  backward_bias_data: un-owned device memory region in which to place the
1493   //    backprop of the bias.
1494   virtual bool DoConvolveBackwardBias(Stream* stream,
1495                                       const BatchDescriptor& input_descriptor,
1496                                       const DeviceMemory<float>& input_data,
1497                                       const BatchDescriptor& bias_descriptor,
1498                                       DeviceMemory<float>* backward_bias_data) {
1499     return false;
1500   }
1501 
1502   virtual bool DoConvolveBackwardBias(
1503       Stream* stream, const BatchDescriptor& input_descriptor,
1504       const DeviceMemory<double>& input_data,
1505       const BatchDescriptor& bias_descriptor,
1506       DeviceMemory<double>* backward_bias_data) {
1507     return false;
1508   }
1509 
1510   virtual bool DoConvolveBackwardBias(
1511       Stream* stream, const BatchDescriptor& input_descriptor,
1512       const DeviceMemory<Eigen::half>& input_data,
1513       const BatchDescriptor& bias_descriptor,
1514       DeviceMemory<Eigen::half>* backward_bias_data) {
1515     return false;
1516   }
1517 
1518   // Fully connects the "nodes" (float values) in input_data with
1519   // shape input_dimensions to output_data with output_dimensions
1520   // using provided weights. This is equivalent to computing a matrix
1521   // product, hence the name MatMul.
1522   //
1523   // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
1524   // happen in two dimensions. To get down to two dimensions, we consider the
1525   // input y, x and depth dimension as one combined dimension T. For now,
1526   // assume that the output height and width are 1 and let OD be the output
1527   // depth.
1528   //
1529   // There are three device memory buffers passed in to this
1530   // function. We can now view all three as matrices:
1531   //
1532   //   input_data: A batch x T matrix
1533   //   weights: A T x OD matrix
1534   //   output_data: A batch x OD matrix
1535   //
1536   // This function then computes the matrix product of input_data and
1537   // weights and writes the result into output_data.
1538   //
1539   // Here the weights buffer is in row major order, i.e. the first OD
1540   // entries in weights are the first row, the second OD entries in
1541   // weights are the second row and so on.
1542   //
1543   // The case for output width*height > 1 is more complicated. Let K =
1544   // OY * OX where OY is the output height and OX is the output
1545   // width. Then weights is divided into K sub-arrays W_i, for
1546   // i=0,...,K-1, each of which represents a T x OD matrix. This function
1547   // then computes the K matrix multiplications of input_data with
1548   // each W_i. This creates K matrices with dimensions batch x
1549   // OD. These K matrices are concatenated horizontally to form one
1550   // larger matrix with dimensions batch x (K*OD); note that this is
1551   // not the same as concatenating the bytes of the matrices. The
1552   // combined matrix can then be interpreted as a tensor with
1553   // dimensions (batch, OY, OX, OD). If the output tensor format is
1554   // not kBatchYXDepth, this function would then need to arrange for
1555   // the output to be in the requested layout, if that is
1556   // supported. Note that the case K=1 is equivalent to the
1557   // description above. It is recommended to prefer the case K=1.
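  //
  // Worked example (illustrative numbers): with batch = 32, T = y * x * depth
  // = 512, OY = OX = 1 and OD = 10, input_data is a 32 x 512 matrix, weights
  // is a 512 x 10 row-major matrix, and output_data is their 32 x 10 product.
  // With OY = 2 and OX = 2 (so K = 4), weights holds four such 512 x 10
  // sub-matrices and the concatenated result is 32 x 40, interpreted as a
  // (batch=32, OY=2, OX=2, OD=10) tensor.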
1558   //
1559   // Arguments (all borrowed):
1560   //  stream: borrowed pointer to the stream that the 'fully connect' operation
1561   //    should be enqueued onto.
1562   //  output_data: un-owned device memory region in which to place the
1563   //    fully connected result.
1564   virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
1565                         const DeviceMemory<float>& weights,
1566                         const dnn::BatchDescriptor& input_dimensions,
1567                         const dnn::BatchDescriptor& output_dimensions,
1568                         DeviceMemory<float>* output_data) = 0;
1569 
1570   // Version of DoMatMul that uses pre-quantized 8 bit weights.
1571   // weight_scales specifies the scaling of each column of weights:
1572   // original float weight[row * num_columns + column] =
1573   //     quantized_weight[row * num_columns + column] * weight_scales[column].
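  //
  // For example (illustrative values): if quantized_weight[row * num_columns +
  // column] is 64 and weight_scales[column] is 0.005f, the effective float
  // weight used in the product is 64 * 0.005f = 0.32f.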
1574   virtual bool DoMatMulQuantized(Stream* stream,
1575                                  const DeviceMemory<float>& input_data,
1576                                  const DeviceMemory<int8>& quantized_weights,
1577                                  const DeviceMemory<float>& weight_scales,
1578                                  const dnn::BatchDescriptor& input_dimensions,
1579                                  const dnn::BatchDescriptor& output_dimensions,
1580                                  DeviceMemory<float>* output_data) = 0;
1581 
1582   // Version of DoMatMul that uses pre-quantized 16 bit weights.
1583   // weight_scales specifies the scaling of each column of weights:
1584   // original float weight[row * num_columns + column] =
1585   //     quantized_weight[row * num_columns + column] * weight_scales[column].
1586   virtual bool DoMatMulQuantized(Stream* stream,
1587                                  const DeviceMemory<float>& input_data,
1588                                  const DeviceMemory<int16>& quantized_weights,
1589                                  const DeviceMemory<float>& weight_scales,
1590                                  const dnn::BatchDescriptor& input_dimensions,
1591                                  const dnn::BatchDescriptor& output_dimensions,
1592                                  DeviceMemory<float>* output_data) = 0;
1593 
1594   // Adds biases to the feature maps in input_data producing
1595   // output_data. input_data can equal output_data, but must not
1596   // partially overlap it.
1597   //
1598   // Let K = count() * height() * width() and N = feature_map_count() of the
1599   // 'dimensions' argument. Then input_data contains K*N values and biases
1600   // contains N values. We can thus logically consider input_data to
1601   // contain K vectors of N elements each. This function adds the N biases
1602   // to each of those K vectors.
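  //
  // For example (illustrative shapes): with count() = 2, height() = width() = 1
  // and feature_map_count() = 3, input_data holds K * N = 2 * 3 = 6 values and
  // biases holds N = 3 values; the three biases are added to each of the two
  // groups of three input values.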
1603   //
1604   // TODO(broune): This works differently when width() * height() > 1
1605   // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In
1606   // that case there should be width() * height() *
1607   // feature_map_count() biases, but this is not implemented on all
1608   // StreamExecutors.
1609   //
1610   // Arguments (all borrowed):
1611   //  stream: borrowed pointer to the stream that the 'bias add' operation
1612   //    should be enqueued onto.
1613   //  input_data: un-owned device memory region containing the input.
1614   //  biases: un-owned device memory region containing biases to add to the
1615   //    input.
1616   //  dimensions: dimensions of input_data and output_data.
1617   //  output_data: un-owned device memory region in which to place the result.
1618   virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
1619                          const DeviceMemory<float>& biases,
1620                          const dnn::BatchDescriptor& dimensions,
1621                          DeviceMemory<float>* output_data) = 0;
1622 
1623   // Performs a forward pooling operation on input_data, writing to
1624   // output_data. See PoolingDescriptor for how to configure the
1625   // pooling operation.
1626   //
1627   // Pooling happens as a window that moves across the Y and X
1628   // dimensions of input_data, where each position of the window
1629   // yields one output value. E.g. for max pooling, the computed value
1630   // is the maximum element in the window. The operation is applied
1631   // independently to each batch and at each feature map (depth), so
1632   // that the output depth and feature_map_count are the same as for
1633   // the input. The output width and height can be different.
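  //
  // For example (illustrative numbers): max pooling with a 2x2 window and
  // stride 2 over the 4x4 single-feature-map input
  //   1 3 | 2 1
  //   4 2 | 0 5
  //   ----+----
  //   7 1 | 3 2
  //   0 6 | 4 8
  // yields the 2x2 output
  //   4 5
  //   7 8
  // computed independently for every batch element and feature map.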
1634   //
1635   // See PoolingDescriptor for how to configure the pooling operation.
1636   virtual bool DoPoolForward(Stream* stream,
1637                              const dnn::PoolingDescriptor& pooling_dimensions,
1638                              const dnn::BatchDescriptor& input_dimensions,
1639                              const DeviceMemory<float>& input_data,
1640                              const dnn::BatchDescriptor& output_dimensions,
1641                              DeviceMemory<float>* output_data,
1642                              ScratchAllocator* workspace_allocator) = 0;
1643 
1644   virtual bool DoPoolForward(Stream* stream,
1645                              const dnn::PoolingDescriptor& pooling_dimensions,
1646                              const dnn::BatchDescriptor& input_dimensions,
1647                              const DeviceMemory<double>& input_data,
1648                              const dnn::BatchDescriptor& output_dimensions,
1649                              DeviceMemory<double>* output_data,
1650                              ScratchAllocator* workspace_allocator) {
1651     LOG(FATAL) << "DoPoolForward not implemented for double.";
1652     return false;
1653   }
1654 
1655   virtual bool DoPoolForward(Stream* stream,
1656                              const dnn::PoolingDescriptor& pooling_dimensions,
1657                              const dnn::BatchDescriptor& input_dimensions,
1658                              const DeviceMemory<Eigen::half>& input_data,
1659                              const dnn::BatchDescriptor& output_dimensions,
1660                              DeviceMemory<Eigen::half>* output_data,
1661                              ScratchAllocator* workspace_allocator) {
1662     LOG(FATAL) << "DoPoolForward not implemented for float16.";
1663     return false;
1664   }
1665 
1666   virtual bool DoPoolForward(Stream* stream,
1667                              const dnn::PoolingDescriptor& pooling_dimensions,
1668                              const dnn::BatchDescriptor& input_dimensions,
1669                              const DeviceMemory<int8>& input_data,
1670                              const dnn::BatchDescriptor& output_dimensions,
1671                              DeviceMemory<int8>* output_data,
1672                              ScratchAllocator* workspace_allocator) {
1673     LOG(FATAL) << "DoPoolForward not implemented for int8.";
1674     return false;
1675   }
1676 
1677   // Performs differentiation of the pooling operation.
1678   virtual bool DoPoolBackward(Stream* stream,
1679                               const dnn::PoolingDescriptor& pooling_dimensions,
1680                               const dnn::BatchDescriptor& input_dimensions,
1681                               const DeviceMemory<double>& input_data,
1682                               const dnn::BatchDescriptor& output_dimensions,
1683                               const DeviceMemory<double>& output_data,
1684                               const DeviceMemory<double>& input_diff_data,
1685                               DeviceMemory<double>* output_diff_data,
1686                               ScratchAllocator* workspace_allocator) {
1687     LOG(FATAL) << "DoPoolBackward not implemented.";
1688     return false;
1689   }
1690 
1691   virtual bool DoPoolBackward(Stream* stream,
1692                               const dnn::PoolingDescriptor& pooling_dimensions,
1693                               const dnn::BatchDescriptor& input_dimensions,
1694                               const DeviceMemory<float>& input_data,
1695                               const dnn::BatchDescriptor& output_dimensions,
1696                               const DeviceMemory<float>& output_data,
1697                               const DeviceMemory<float>& input_diff_data,
1698                               DeviceMemory<float>* output_diff_data,
1699                               ScratchAllocator* workspace_allocator) {
1700     LOG(FATAL) << "DoPoolBackward not implemented.";
1701     return false;
1702   }
1703 
1704   virtual bool DoPoolBackward(Stream* stream,
1705                               const dnn::PoolingDescriptor& pooling_dimensions,
1706                               const dnn::BatchDescriptor& input_dimensions,
1707                               const DeviceMemory<Eigen::half>& input_data,
1708                               const dnn::BatchDescriptor& output_dimensions,
1709                               const DeviceMemory<Eigen::half>& output_data,
1710                               const DeviceMemory<Eigen::half>& input_diff_data,
1711                               DeviceMemory<Eigen::half>* output_diff_data,
1712                               ScratchAllocator* workspace_allocator) {
1713     LOG(FATAL) << "DoPoolBackward not implemented.";
1714     return false;
1715   }
1716 
1717   // Applies local response normalization to the values from input_data and
1718   // writes the result to output_data.
1719   //
1720   // See comments on NormalizeDescriptor for a description of local response
1721   // normalization.
1722   virtual bool DoNormalizeWithDimensions(
1723       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
1724       const dnn::BatchDescriptor& dimensions,
1725       const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
1726     return false;
1727   }
1728 
1729   // Performs backpropagation for the normalization operation
1730   //
1731   // Given raw data, its corresponding normalized output, and a gradient of some
1732   // unspecified function with respect to the normalized variables, computes the
1733   // gradient of that unspecified function with respect to the raw variables.
1734   //
1735   // The normalized data input array is expected to match the output that would
1736   // be obtained by running the raw data input array through the DoNormalize
1737   // method above.
1738   //
1739   // See comments on NormalizeDescriptor for a description of local response
1740   // normalization.
1741   virtual bool DoNormalizeBackwardWithDimensions(
1742       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
1743       const dnn::BatchDescriptor& dimensions,
1744       const DeviceMemory<float>& raw_data,
1745       const DeviceMemory<float>& normalized_data,
1746       const DeviceMemory<float>& normalized_variable_gradient,
1747       DeviceMemory<float>* raw_variable_gradient,
1748       ScratchAllocator* workspace_allocator) {
1749     return false;
1750   }
1751 
1752   // Applies an activation function (see ActivationMode) to all of the values
1753   // held on the device in 'input_data', whose dimensions are described by
1754   // 'dimensions'.
1755   //
1756   // Arguments (all borrowed):
1757   //  stream: borrowed pointer to the stream that the 'activate' operation
1758   //    should be enqueued onto.
1759   //  activation_mode: Type of activation to perform.
1760   //  input_data: un-owned device memory region which contains the
1761   //    activate input.
1762   //  output_data: un-owned device memory region in which to place the
1763   //    activate result.
1764   virtual bool DoActivate(Stream* stream, ActivationMode activation_mode,
1765                           const BatchDescriptor& dimensions,
1766                           const DeviceMemory<float>& input_data,
1767                           DeviceMemory<float>* output_data, uint64 options) {
1768     return false;
1769   }
1770 
1771   // Concatenates several layers into one, by concatenating the depth of each
1772   // layer at matching x and y coordinates.
1773   // The inputs must all have the same width and height, the output will have
1774   // the same width and height as the inputs and its depth will be the sum of
1775   // the input depths.
1776   //
1777   // Arguments (all borrowed):
1778   //  stream: borrowed pointer to the stream that the 'depth concatenate'
1779   // operation should be enqueued onto.
1780   //  input_dimensions: The dimensions of each input.
1781   //  input_data: un-owned device memory region which contains the
1782   //    input data for each input layer.
1783   //  output_data: un-owned device memory region in which to place the
1784   //    depth concatenate result.
1785   virtual bool DoDepthConcatenate(
1786       Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1787       port::ArraySlice<const DeviceMemory<float>*> input_data,
1788       DeviceMemory<float>* output_data) = 0;
1789 
1790   // Concatenates several layers into one, by concatenating each in the
1791   // x-dimension or y-dimension, based on a user-specified flag.
1792   // For x-concatenation, layers are aligned at matching y and depth
1793   // coordinates, and for y-concatenation, they are aligned at matching x and
1794   // depth coordinates. The inputs must all have the same depth and batch size.
1795   // For x-concatenation, the inputs must have the same height (y-size), and the
1796   // output will have the same depth and height as the inputs and its width (x-
1797   // size) will be the sum of the input widths.  For y-concatenation, the inputs
1798   // must have the same width, and the output will have the same depth and width
1799   // as the inputs, and its height will be the sum of the input heights.
1800   //
1801   // Arguments:
1802   //  stream: borrowed pointer to the stream that the 'space concatenate'
1803   //    operation should be enqueued onto.
1804   //  input_dimensions: the dimensions of each input.
1805   //  input_data: un-owned device memory region which contains the input data
1806   //    for each input layer.
1807   //  output_data: un-owned device memory region in which to place the space
1808   //    concatenate result.
1809   //  concat_direction: either dnn::SpaceConcatenateMode::XDirection or
1810   //    dnn::SpaceConcatenateMode::YDirection.
1811   virtual bool DoSpaceConcatenate(
1812       Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1813       port::ArraySlice<const DeviceMemory<float>*> input_data,
1814       DeviceMemory<float>* output_data,
1815       dnn::SpaceConcatenateMode concat_direction) {
1816     return false;
1817   }
1818 
1819   // Change the layout of the data by shrinking one dimension (or set of
1820   // dimensions) and growing another dimension (or set of dimensions), while
1821   // keeping the total number of data elements constant, and maintaining the
1822   // current data ordering.
1823   //
1824   // Currently, the only supported operation is depth into space by a power of
1825   // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
1826   //
1827   // Note that Reshape may not be a no-op, depending on the platform and which
1828   // dimensions are being changed.
1829   //
1830   // Example: forgetting about batch for the moment, let's take a tensor that's
1831   // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
1832   // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
1833   // elements of the tensor range from 0 to 15. The x,y,z indices are below each
1834   // element.
1835   //
1836   //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
1837   // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
1838   // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
1839   // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
1840   //
1841   // reshape to 4x2x2
1842   //
1843   //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
1844   // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
1845   // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
1846   // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
1847   virtual bool DoReshape(Stream* stream,
1848                          const dnn::BatchDescriptor& input_dimensions,
1849                          const DeviceMemory<float>& input_data,
1850                          const dnn::BatchDescriptor& output_dimensions,
1851                          DeviceMemory<float>* output_data) {
1852     return false;
1853   }
1854 
1855   // Depth to space takes an X by Y image with depth D*M^2 and changes it to an
1856   // MX x MY image with depth D. Each input location (x,y) with depth D*M^2 in
1857   // the input image is changed to an MxM contiguous area in the output image,
1858   // with the values being laid out in the raster order by DepthToSpaceLayout,
1859   // and will have a new depth of D.
1860   //
1861   // Example.
1862   // M=2, Din=8, Xin=2, Yin=2; Xout=4, Yout=4, Dout=2
1863   // DepthHeightWidth layout
1864   // Values within a 'cell' are at different depths and same x & y.
1865   // Input:
1866   // abcdefgh  ijklmnop
1867   // qrstuvwx  yz012345
1868   // Output:
1869   // ae bf im jn
1870   // cg dh ko lp
1871   // qu rv y2 z3
1872   // sw tx 04 15
1873   //
1874   // sqrt_depth_reduction: 'M' in the comment above
1875   virtual bool DoDepthToSpace(Stream* stream,
1876                               const dnn::BatchDescriptor& input_dimensions,
1877                               const DeviceMemory<float>& input_data,
1878                               const DepthToSpaceLayout& depth_to_space_layout,
1879                               const int& sqrt_depth_reduction,
1880                               DeviceMemory<float>* output_data) {
1881     return false;
1882   }
1883 
1884   // Space to depth is the inverse of depth to space. Space to depth takes each
1885   // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
1886   // the input, and transforms it to a 1 by 1 patch with depth D*M^2. If the
1887   // input has size (MX, MY, D), the output has size (X, Y, D*M^2). The number
1888   // of data elements is not changed.
1889   //
1890   // Example.
1891   // M=2, Din=2, Xin=4, Yin=4, Dout=8
1892   // DepthHeightWidth layout
1893   // Values within a 'cell' are at different depths and same x & y.
1894   // Input:
1895   // ae bf im jn
1896   // cg dh ko lp
1897   // qu rv y2 z3
1898   // sw tx 04 15
1899   // Output:
1900   // abcdefgh  ijklmnop
1901   // qrstuvwx  yz012345
1902   //
1903   // sqrt_depth_increase: 'M' in the comment above
1904   virtual bool DoSpaceToDepth(Stream* stream,
1905                               const dnn::BatchDescriptor& input_dimensions,
1906                               const DeviceMemory<float>& input_data,
1907                               const DepthToSpaceLayout& space_to_depth_layout,
1908                               const int& sqrt_depth_increase,
1909                               DeviceMemory<float>* output_data) {
1910     return false;
1911   }
1912 
1913   // Computes the specified operation (e.g. addition or multiplication)
1914   // between corresponding elements in the inputs and stores the result in the
1915   // output element.
1916   // The inputs and output must all have the same dimensions, but may have
1917   // different quantization parameters (min_value and max_value).
1918   //
1919   // Arguments (all borrowed):
1920   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1921   // should be enqueued onto.
1922   //  operation: The operation to perform.
1923   //  input_dimensions: The dimensions of each input.
1924   //  input_data: un-owned device memory region which contains the
1925   //    input data for each input layer.
1926   //  output_dimensions: The dimensions of the output.
1927   //  output_data: un-owned device memory region in which to place the
1928   //    operation result.
1929   virtual bool DoElementwiseOperate(
1930       Stream* stream, ElementwiseOperation operation,
1931       port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1932       port::ArraySlice<const DeviceMemory<float>*> input_data,
1933       const dnn::BatchDescriptor& output_dimensions,
1934       DeviceMemory<float>* output_data) = 0;
1935 
1936   // Computes the specified operation (e.g. addition or multiplication)
1937   // between corresponding elements in the inputs and stores the result in the
1938   // output element. Each input is multiplied by a scalar constant and the
1939   // result is divided by a scalar constant.
1940   // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11
1941   // and the output divisor to 10.
1942   // The inputs and output must all have the same dimensions, but may have
1943   // different quantization parameters (min_value and max_value).
1944   //
1945   // Arguments (all borrowed):
1946   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1947   // should be enqueued onto.
1948   //  operation: The operation to perform.
1949   //  input_multiplicands: Amount to scale each input.
1950   //  output_divisor: Amount to divide the output.
1951   //  input_dimensions: The dimensions of each input.
1952   //  input_data: un-owned device memory region which contains the
1953   //    input data for each input layer.
1954   //  output_dimensions: The dimensions of the output.
1955   //  output_data: un-owned device memory region in which to place the
1956   //    operation result.
1957   virtual bool DoElementwiseOperateScaledQuantized(
1958       Stream* stream, ElementwiseOperation operation,
1959       port::ArraySlice<int> input_multiplicands, int output_divisor,
1960       port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1961       port::ArraySlice<const DeviceMemory<float>*> input_data,
1962       const dnn::BatchDescriptor& output_dimensions,
1963       DeviceMemory<float>* output_data) {
1964     return false;
1965   }
1966 
1967   // Pads the input with zeros in the X and Y dimensions. The feature_map
1968   // dimension is unchanged.
1969   //
1970   // Arguments (all borrowed):
1971   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1972   // should be enqueued onto.
1973   //  dimensions: The dimensions of the input.
1974   //  input_data: un-owned device memory region which contains the
1975   //    input data for the input layer.
1976   //  left_pad: Amount to pad the input on the left.
1977   //  right_pad: Amount to pad the input on the right.
1978   //  top_pad: Amount to pad the input at the top (low Y).
1979   //  bottom_pad: Amount to pad the input at the bottom (high Y).
1980   //  output_data: un-owned device memory region in which to place the
1981   //    padded result.
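  //
  // For example (illustrative numbers): padding a 5 (x) by 4 (y) input with
  // left_pad = 1, right_pad = 2, top_pad = 0 and bottom_pad = 3 produces an
  // output of width 5 + 1 + 2 = 8 and height 4 + 0 + 3 = 7; the batch and
  // feature_map dimensions are unchanged.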
1982   virtual bool DoXYPad(Stream* stream, const dnn::BatchDescriptor &dimensions,
1983                        const DeviceMemory<float> &input_data,
1984                        int64 left_pad, int64 right_pad, int64 top_pad,
1985                        int64 bottom_pad, DeviceMemory<float> *output_data) = 0;
1986 
1987   // Extracts a slice of the input in the X and Y dimensions. The feature_map
1988   // dimension is unchanged.
1989   //
1990   // Arguments (all borrowed):
1991   //  stream: borrowed pointer to the stream that the 'elementwise operation'
1992   // should be enqueued onto.
1993   //  dimensions: The dimensions of the input.
1994   //  input_data: un-owned device memory region which contains the
1995   //    input data for the input layer.
1996   //  left_trim: Amount to cut off the input on the left.
1997   //  right_trim: Amount to cut off the input on the right.
1998   //  top_trim: Amount to cut off the input at the top (low y).
1999   //  bottom_trim: Amount to cut off the input at the bottom (high Y).
2000   //  output_data: un-owned device memory region in which to place the
2001   //    padded result.
2002   virtual bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor &dimensions,
2003                     const DeviceMemory<float> &input_data,
2004                     int64 left_trim, int64 right_trim, int64 top_trim,
2005                     int64 bottom_trim, DeviceMemory<float> *output_data) = 0;
2006 
2007   // Grows the input tensor by replicating the X and Y dimensions. The batch and
2008   // depth/feature_map dimensions are unchanged. Currently, the input tensor is
2009   // limited to X=1 and Y=1.
2010   //
2011   // For example, the input has dimensions x=2, y=3, and replicate_x=3,
2012   // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1,
2013   // x0y2, x1y0, x0y1, x1y2].
2014   // Here is the example as a picture. input:
2015   // AB
2016   // CD
2017   // EF
2018   // broadcast result:
2019   // ABABAB
2020   // CDCDCD
2021   // EFEFEF
2022   // ABABAB
2023   // CDCDCD
2024   // EFEFEF
2025   //
2026   // Arguments (all borrowed):
2027   //  stream: borrowed pointer to the stream that the 'elementwise operation'
2028   // should be enqueued onto.
2029   //  dimensions: The dimensions of the input.
2030   //  input_data: un-owned device memory region which contains the
2031   //    input data for the input layer.
2032   //  replicate_x: Amount to replicate the input's X dimension.
2033   //  replicate_y: Amount to replicate the input's Y dimension.
2034   //  output_data: un-owned device memory region in which to place the
2035   //    padded result.
2036   virtual bool DoXYBroadcast(Stream* stream,
2037                              const dnn::BatchDescriptor& dimensions,
2038                              const DeviceMemory<float>& input_data,
2039                              int64 replicate_x, int64 replicate_y,
2040                              DeviceMemory<float>* output_data) {
2041     return false;
2042   }
2043 
2044   // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
2045   // is, bytes instead of scaled floats) into 'host_dst' if they are available
2046   // for the underlying DNN implementation. If this quantized output is not
2047   // available, false is returned, which will place 'stream' into an error
2048   // state.
2049   //
2050   // Arguments (all borrowed):
2051   //  stream: borrowed pointer to the stream that the 'quantized memcpy'
2052   //    operation should be enqueued onto.
2053   //  gpu_unquantized_src: the device memory that contains the unquantized data
2054   //    -- this data should also have a corresponding quantized representation
2055   //    on the device for this operation to succeed.
2056   //  mode: Type of quantization of the data to write into host_dst.
2057   //  host_dst: un-owned host memory region that is mutated in place,
2058   //    it is clobbered by the values in 'gpu_unquantized_src' when the enqueued
2059   //    (asynchronous) memcpy operation is performed.
2060   //  size: size in bytes of the host_dst host memory region.
2061   virtual bool DoMemcpyD2HQuantized(
2062       Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
2063       QuantizedActivationMode mode, void* host_dst, int64 size) = 0;
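
  // Sketch with illustrative sizes: copying an 8-bit quantized view of a
  // 1024-element float activation into host memory. 'dnn', 'stream',
  // 'gpu_activations' and 'host_buffer' are assumed to exist and to be large
  // enough:
  //
  //   constexpr int64 kElementCount = 1024;
  //   bool ok = dnn->DoMemcpyD2HQuantized(
  //       stream, gpu_activations, QuantizedActivationMode::k8Bit,
  //       host_buffer, /*size=*/kElementCount * sizeof(uint8));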
2064 
2065   // Enqueues an asynchronous memcpy of 'host_src' into the *quantized* input
2066   // of a layer (that is, bytes instead of scaled floats) if it is supported
2067   // by the underlying DNN implementation. If this quantized input is not
2068   // supported, false is returned, which will place 'stream' into an error
2069   // state.
2070   //
2071   // Arguments (all borrowed):
2072   //  stream: borrowed pointer to the stream that the 'quantized memcpy'
2073   //    operation should be enqueued onto.
2074   //  host_src: un-owned host memory region that contains the quantized data.
2075   //  size: size in bytes of the host_src host memory region.
2076   //  mode: Type of quantization of the data to read from host_src.
2077   //  gpu_unquantized_dst: the device memory that is clobbered by the values
2078   //    in 'host_src' when the enqueued (asynchronous) memcpy operation is
2079   //    performed. This data should also have a corresponding quantized
2080   //    representation on the device for this operation to
2081   //    succeed.
2082   virtual bool DoMemcpyH2DQuantized(
2083       Stream* stream, const void* host_src, int64 size,
2084       QuantizedActivationMode mode,
2085       DeviceMemory<float>* gpu_unquantized_dst) = 0;
2086 
2087   // Create an RNN descriptor based on model shapes and configurations.
2088   // The caller retains the ownership of the descriptor.
2089   //
2090   // Arguments:
2091   //  num_layers: the number of layers for an RNN model.
2092   //  hidden_size: the size of the hidden state.
2093   //  input_size: the size of the input state.
2094   //  cell_size: the size of the cell state.
2095   //  input_mode: an enum to specify whether a linear transformation is added
2096   //    after the input state. If input_size is different from hidden_size, this
2097   //    is required.
2098   //  direction_mode: an enum to specify whether this model is unidirectional or
2099   //    bidirectional.
2100   //  rnn_mode: an enum to specify the type of model to build.
2101   //  data_type: an enum to specify the data types used in this model.
2102   //  dropout: the dropout threshold between layers. When it is 0.0, no dropout
2103   //    is added.
2104   //  seed: a seed for initializing the dropout layers.
2105   //  state_allocator: a memory allocator that will be used to store the state
2106   //    for the dropout layer. The user has to maintain the memory until the
2107   //    model is no longer in use.
2108   //  use_padded_io: a bool to specify whether the input is using padded IO.
2109   virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
2110   createRnnDescriptor(int num_layers, int hidden_size, int input_size,
2111                       int cell_size, int batch_size,
2112                       dnn::RnnInputMode input_mode,
2113                       dnn::RnnDirectionMode direction_mode,
2114                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
2115                       const dnn::AlgorithmConfig& algorithm_config,
2116                       float dropout, uint64 seed,
2117                       ScratchAllocator* state_allocator, bool use_padded_io) {
2118     return port::Status(port::error::UNIMPLEMENTED,
2119                         "createRnnDescriptor is unimplemented");
2120   }
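
  // Sketch of building a 2-layer LSTM descriptor (all values are
  // illustrative; 'state_allocator' is assumed to outlive the model). Since
  // input_size (64) differs from hidden_size (128), an input transformation
  // is requested via kRnnLinearSkip:
  //
  //   auto rnn_desc_or = dnn->createRnnDescriptor(
  //       /*num_layers=*/2, /*hidden_size=*/128, /*input_size=*/64,
  //       /*cell_size=*/128, /*batch_size=*/32,
  //       dnn::RnnInputMode::kRnnLinearSkip,
  //       dnn::RnnDirectionMode::kRnnUnidirectional, dnn::RnnMode::kRnnLstm,
  //       dnn::DataType::kFloat, dnn::AlgorithmConfig(), /*dropout=*/0.0f,
  //       /*seed=*/0, state_allocator, /*use_padded_io=*/false);
  //   if (!rnn_desc_or.ok()) { /* report or fall back */ }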
2121 
2122   // Create an RNN sequence descriptor that specifies either the input or output
2123   // sequence. The caller retains the ownership of the returned descriptor.
2124   //
2125   // Arguments:
2126   //  max_seq_length: the max length of the sequences.
2127   //  batch_size: the size of a minibatch.
2128   //  data_size: the size of the state.
2129   //  seq_lengths: the lengths of sequences in a batch.
2130   //  data_type: an enum to specify the type for the underlying data.
2131   virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2132   createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2133                                     int data_size, dnn::DataType data_type) {
2134     return port::Status(port::error::UNIMPLEMENTED,
2135                         "createRnnSequenceTensorDescriptor is unimplemented");
2136   }
2137 
2138   virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2139   createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2140                                     int data_size,
2141                                     const absl::Span<const int>& seq_lengths,
2142                                     bool time_major, dnn::DataType data_type) {
2143     return port::Status(port::error::UNIMPLEMENTED,
2144                         "createRnnSequenceTensorDescriptor is unimplemented");
2145   }
2146 
2147   // Create an RNN state descriptor that specifies the input or hidden state.
2148   // The caller retains the ownership of the returned descriptor.
2149   virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
2150   createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
2151                                  dnn::DataType data_type) {
2152     return port::Status(port::error::UNIMPLEMENTED,
2153                         "createRnnStateTensorDescriptor is unimplemented");
2154   }
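
  // For example (hypothetical sizes), the per-layer hidden state of the LSTM
  // sketched above could be described with:
  //
  //   auto h_desc_or = dnn->createRnnStateTensorDescriptor(
  //       /*num_layer=*/2, /*batch_size=*/32, /*data_size=*/128,
  //       dnn::DataType::kFloat);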
2155 
2156   // Enqueue a forward operation of the RNN model onto the stream.
2157   //
2158   // Arguments:
2159   //  stream: pointer to the stream where this operation should be enqueued to.
2160   //  rnn_desc: an RNN descriptor created by createRnnDescriptor.
2161   //  input_desc: descriptor for the input sequence.
2162   //  input_data: the device memory region that contains the input data.
2163   //  input_h_desc: descriptor for the input "h" state.
2164   //  input_h_data: the device memory region that contains the input "h" data.
2165   //  input_c_desc: descriptor for the input "c" state.
2166   //  input_c_data: the device memory region that contains the input "c" data.
2167   //    This must be specified for LSTM models.
2168   //  params: the device memory region that contains the parameters used in this
2169   //    model.
2170   //  output_desc: descriptor for the output sequence.
2171   //  output_data: the memory region that stores the output sequence data.
2172   //  output_h_desc: descriptor for the output "h" state.
2173   //  output_h_data: the memory region that stores the output "h" data.
2174   //  output_c_desc: descriptor for the output "c" state.
2175   //  output_c_data: the memory region that stores the output "c" data. This
2176   //    must be specified for LSTM models.
2177   //  is_training: whether this is used in training or inference. That decides
2178   //    whether reserve_space data needs to be produced.
2179   //  reserve_space_allocator: if "is_training" is true, a memory allocator
2180   //    to create memory that holds the produced reserve_space. The caller
2181   //    retains the data and feeds it to the backward pass.
2182   //  workspace_allocator: an allocator to create temporary workspace used in
2183   //    this kernel. The caller is responsible for retaining the memory long
2184   //    enough for the lifespan of this operation, and recycling it afterwards.
2185   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2186                             const dnn::RnnSequenceTensorDescriptor& input_desc,
2187                             const DeviceMemory<Eigen::half>& input_data,
2188                             const dnn::RnnStateTensorDescriptor& input_h_desc,
2189                             const DeviceMemory<Eigen::half>& input_h_data,
2190                             const dnn::RnnStateTensorDescriptor& input_c_desc,
2191                             const DeviceMemory<Eigen::half>& input_c_data,
2192                             const DeviceMemory<Eigen::half>& params,
2193                             const dnn::RnnSequenceTensorDescriptor& output_desc,
2194                             DeviceMemory<Eigen::half>* output_data,
2195                             const dnn::RnnStateTensorDescriptor& output_h_desc,
2196                             DeviceMemory<Eigen::half>* output_h_data,
2197                             const dnn::RnnStateTensorDescriptor& output_c_desc,
2198                             DeviceMemory<Eigen::half>* output_c_data,
2199                             bool is_training,
2200                             ScratchAllocator* reserve_space_allocator,
2201                             ScratchAllocator* workspace_allocator,
2202                             dnn::ProfileResult* output_profile_result) {
2203     return false;
2204   }
2205 
2206   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2207                             const dnn::RnnSequenceTensorDescriptor& input_desc,
2208                             const DeviceMemory<float>& input_data,
2209                             const dnn::RnnStateTensorDescriptor& input_h_desc,
2210                             const DeviceMemory<float>& input_h_data,
2211                             const dnn::RnnStateTensorDescriptor& input_c_desc,
2212                             const DeviceMemory<float>& input_c_data,
2213                             const DeviceMemory<float>& params,
2214                             const dnn::RnnSequenceTensorDescriptor& output_desc,
2215                             DeviceMemory<float>* output_data,
2216                             const dnn::RnnStateTensorDescriptor& output_h_desc,
2217                             DeviceMemory<float>* output_h_data,
2218                             const dnn::RnnStateTensorDescriptor& output_c_desc,
2219                             DeviceMemory<float>* output_c_data,
2220                             bool is_training,
2221                             ScratchAllocator* reserve_space_allocator,
2222                             ScratchAllocator* workspace_allocator,
2223                             dnn::ProfileResult* output_profile_result) {
2224     return false;
2225   }
2226 
2227   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2228                             const dnn::RnnSequenceTensorDescriptor& input_desc,
2229                             const DeviceMemory<double>& input_data,
2230                             const dnn::RnnStateTensorDescriptor& input_h_desc,
2231                             const DeviceMemory<double>& input_h_data,
2232                             const dnn::RnnStateTensorDescriptor& input_c_desc,
2233                             const DeviceMemory<double>& input_c_data,
2234                             const DeviceMemory<double>& params,
2235                             const dnn::RnnSequenceTensorDescriptor& output_desc,
2236                             DeviceMemory<double>* output_data,
2237                             const dnn::RnnStateTensorDescriptor& output_h_desc,
2238                             DeviceMemory<double>* output_h_data,
2239                             const dnn::RnnStateTensorDescriptor& output_c_desc,
2240                             DeviceMemory<double>* output_c_data,
2241                             bool is_training,
2242                             ScratchAllocator* reserve_space_allocator,
2243                             ScratchAllocator* workspace_allocator,
2244                             dnn::ProfileResult* output_profile_result) {
2245     return false;
2246   }
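
  // Sketch of an inference-time call to the float overload. Every descriptor
  // and DeviceMemory<float> buffer named below is assumed to have been
  // created by the caller; the reserve-space allocator is only consulted when
  // 'is_training' is true:
  //
  //   bool ok = dnn->DoRnnForward(
  //       stream, *rnn_desc, *input_desc, input_data, *input_h_desc,
  //       input_h_data, *input_c_desc, input_c_data, params, *output_desc,
  //       &output_data, *output_h_desc, &output_h_data, *output_c_desc,
  //       &output_c_data, /*is_training=*/false, reserve_space_allocator,
  //       workspace_allocator, /*output_profile_result=*/nullptr);
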
2247   // Enqueue a backward operation of the RNN model onto the stream.
2248   //
2249   // Arguments:
2250   //  stream: pointer to the stream where this operation should be enqueued to.
2251   //  rnn_desc: an RNN descriptor created by createRnnDescriptor.
2252   //  input_desc: descriptor for the input sequence.
2253   //  input_data: the device memory region that contains the input data.
2254   //  input_h_desc: descriptor for the input "h" state.
2255   //  input_h_data: the device memory region that contains the input "h" data.
2256   //  input_c_desc: descriptor for the input "c" state.
2257   //  input_c_data: the device memory region that contains the input "c" data.
2258   //    This must be specified for LSTM models.
2259   //  params: the device memory region that contains the parameters used in this
2260   //    model.
2261   //  output_desc: descriptor for the output sequence.
2262   //  output_data: the memory region that stores the output sequence data.
2263   //  output_h_desc: descriptor for the output "h" state.
2264   //  output_h_data: the memory region that stores the output "h" data.
2265   //  output_c_desc: descriptor for the output "c" state.
2266   //  output_c_data: the memory region that stores the output "c" data. This
2267   //    must be specified for LSTM models.
2268   //  output_backprop_data: the device memory region that contains the backprop
2269   //    to the output sequence.
2270   //  output_h_backprop_data: the device memory region that contains the
2271   //    backprop to the output "h" state.
2272   //  output_c_backprop_data: the device memory region that contains the
2273   //    backprop to the output "c" state.
2274   //  input_backprop_data: the device memory region that stores the backprop
2275   //    to the input sequence.
2276   //  input_h_backprop_data: the device memory region that stores the backprop
2277   //    to the input "h" state.
2278   //  input_c_backprop_data: the device memory region that stores the backprop
2279   //    to the input "c" state.
2280   //  params_backprop_data: the device memory region that stores the backprop
2281   //    to the parameters.
2282   //  reserve_space_data: the reserve_space data that is produced by the forward
2283   //    operation. This memory region could be modified by this operation.
2284   //  workspace_allocator: a memory allocator that creates the temporary
2285   //    workspace memory used by this operation. The caller is responsible for
2286   //    keeping the memory alive long enough for this operation, and recycling
2287   //    it afterwards.
2288   virtual bool DoRnnBackward(
2289       Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2290       const dnn::RnnSequenceTensorDescriptor& input_desc,
2291       const DeviceMemory<Eigen::half>& input_data,
2292       const dnn::RnnStateTensorDescriptor& input_h_desc,
2293       const DeviceMemory<Eigen::half>& input_h_data,
2294       const dnn::RnnStateTensorDescriptor& input_c_desc,
2295       const DeviceMemory<Eigen::half>& input_c_data,
2296       const DeviceMemory<Eigen::half>& params,
2297       const dnn::RnnSequenceTensorDescriptor& output_desc,
2298       const DeviceMemory<Eigen::half>& output_data,
2299       const dnn::RnnStateTensorDescriptor& output_h_desc,
2300       const DeviceMemory<Eigen::half>& output_h_data,
2301       const dnn::RnnStateTensorDescriptor& output_c_desc,
2302       const DeviceMemory<Eigen::half>& output_c_data,
2303       const DeviceMemory<Eigen::half>& output_backprop_data,
2304       const DeviceMemory<Eigen::half>& output_h_backprop_data,
2305       const DeviceMemory<Eigen::half>& output_c_backprop_data,
2306       DeviceMemory<Eigen::half>* input_backprop_data,
2307       DeviceMemory<Eigen::half>* input_h_backprop_data,
2308       DeviceMemory<Eigen::half>* input_c_backprop_data,
2309       DeviceMemory<Eigen::half>* params_backprop_data,
2310       DeviceMemory<uint8>* reserve_space_data,
2311       ScratchAllocator* workspace_allocator,
2312       dnn::ProfileResult* output_profile_result) {
2313     return false;
2314   }
2315 
2316   virtual bool DoRnnBackward(
2317       Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2318       const dnn::RnnSequenceTensorDescriptor& input_desc,
2319       const DeviceMemory<float>& input_data,
2320       const dnn::RnnStateTensorDescriptor& input_h_desc,
2321       const DeviceMemory<float>& input_h_data,
2322       const dnn::RnnStateTensorDescriptor& input_c_desc,
2323       const DeviceMemory<float>& input_c_data,
2324       const DeviceMemory<float>& params,
2325       const dnn::RnnSequenceTensorDescriptor& output_desc,
2326       const DeviceMemory<float>& output_data,
2327       const dnn::RnnStateTensorDescriptor& output_h_desc,
2328       const DeviceMemory<float>& output_h_data,
2329       const dnn::RnnStateTensorDescriptor& output_c_desc,
2330       const DeviceMemory<float>& output_c_data,
2331       const DeviceMemory<float>& output_backprop_data,
2332       const DeviceMemory<float>& output_h_backprop_data,
2333       const DeviceMemory<float>& output_c_backprop_data,
2334       DeviceMemory<float>* input_backprop_data,
2335       DeviceMemory<float>* input_h_backprop_data,
2336       DeviceMemory<float>* input_c_backprop_data,
2337       DeviceMemory<float>* params_backprop_data,
2338       DeviceMemory<uint8>* reserve_space_data,
2339       ScratchAllocator* workspace_allocator,
2340       dnn::ProfileResult* output_profile_result) {
2341     return false;
2342   }
2343 
2344   virtual bool DoRnnBackward(
2345       Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2346       const dnn::RnnSequenceTensorDescriptor& input_desc,
2347       const DeviceMemory<double>& input_data,
2348       const dnn::RnnStateTensorDescriptor& input_h_desc,
2349       const DeviceMemory<double>& input_h_data,
2350       const dnn::RnnStateTensorDescriptor& input_c_desc,
2351       const DeviceMemory<double>& input_c_data,
2352       const DeviceMemory<double>& params,
2353       const dnn::RnnSequenceTensorDescriptor& output_desc,
2354       const DeviceMemory<double>& output_data,
2355       const dnn::RnnStateTensorDescriptor& output_h_desc,
2356       const DeviceMemory<double>& output_h_data,
2357       const dnn::RnnStateTensorDescriptor& output_c_desc,
2358       const DeviceMemory<double>& output_c_data,
2359       const DeviceMemory<double>& output_backprop_data,
2360       const DeviceMemory<double>& output_h_backprop_data,
2361       const DeviceMemory<double>& output_c_backprop_data,
2362       DeviceMemory<double>* input_backprop_data,
2363       DeviceMemory<double>* input_h_backprop_data,
2364       DeviceMemory<double>* input_c_backprop_data,
2365       DeviceMemory<double>* params_backprop_data,
2366       DeviceMemory<uint8>* reserve_space_data,
2367       ScratchAllocator* workspace_allocator,
2368       dnn::ProfileResult* output_profile_result) {
2369     return false;
2370   }
2371 
2372   template <typename ElementType>
2373   port::Status PrepareForCtcLoss(Stream* stream,
2374                                  const RnnStateTensorDescriptor& probs_desc,
2375                                  DeviceMemory<ElementType> probs_data,
2376                                  const RnnStateTensorDescriptor& grads_desc,
2377                                  absl::Span<const int> labels_data,
2378                                  absl::Span<const int> labels_lengths_data,
2379                                  absl::Span<const int> input_lengths_data,
2380                                  ScratchAllocator* workspace_allocator,
2381                                  DeviceMemory<uint8>* scratch_memory,
2382                                  int* ctc_loss_algo_id) {
2383     return DoPrepareForCtcLoss(
2384         stream, ToDataType<ElementType>::value, probs_desc, grads_desc,
2385         labels_data, labels_lengths_data, input_lengths_data,
2386         workspace_allocator, scratch_memory, ctc_loss_algo_id);
2387   }
2388 
2389   // Enqueue a CTC Loss operation onto the stream.
2390   //
2391   // Arguments:
2392   //  stream: pointer to the stream where this operation should be enqueued to.
2393   //  element_type: data type of the input tensors.
2394   //  probs_desc: specifies the shape and the data layout of the input tensor.
2395   //  probs_data: the device memory region that contains the input tensor.
2396   //  labels_data: the device memory region that contains the labels_value
2397   //    tensor.
2398   //  labels_lengths_data: the device memory region that contains the
2399   //    labels_lengths tensor.
2400   //  input_lengths_data: the device memory region that contains the seq_lengths
2401   //    tensor.
2402   //  costs_data: the device memory region that contains the costs tensor.
2403   //  grads_desc: specifies the shape and the data layout of the grads tensor.
2404   //  grads_data: the device memory region that contains the grads tensor.
2405   //  scratch_memory: temporary workspace memory, as prepared by a prior call
2406   //    to PrepareForCtcLoss. The caller is responsible for keeping the memory
2407   //    alive long enough for this operation.
2408   //  ctc_loss_algo_id: the id of the CTC loss algorithm selected by
2409   //    PrepareForCtcLoss.
2410   virtual port::Status DoCtcLoss(
2411       Stream* stream, dnn::DataType element_type,
2412       const RnnStateTensorDescriptor& probs_desc,
2413       const DeviceMemoryBase probs_data, absl::Span<const int> labels_data,
2414       absl::Span<const int> labels_lengths_data,
2415       absl::Span<const int> input_lengths_data, DeviceMemoryBase costs_data,
2416       const RnnStateTensorDescriptor& grads_desc, DeviceMemoryBase grads_data,
2417       DeviceMemory<uint8> scratch_memory, int ctc_loss_algo_id);
2418 
2419   template <typename ElementType>
2420   bool DoCtcLoss(Stream* stream,
2421                  const dnn::RnnStateTensorDescriptor& probs_desc,
2422                  const DeviceMemory<ElementType>& probs_data,
2423                  absl::Span<const int> labels_data,
2424                  absl::Span<const int> labels_lengths_data,
2425                  absl::Span<const int> input_lengths_data,
2426                  DeviceMemory<ElementType>* costs_data,
2427                  const dnn::RnnStateTensorDescriptor& grads_desc,
2428                  DeviceMemory<ElementType>* grads_data,
2429                  DeviceMemory<uint8>* scratch_memory, int ctc_loss_algo_id) {
2430     return IsStatusOk(
2431         DoCtcLoss(stream, ToDataType<ElementType>::value, probs_desc,
2432                   probs_data, labels_data, labels_lengths_data,
2433                   input_lengths_data, *costs_data, grads_desc, *grads_data,
2434                   *scratch_memory, ctc_loss_algo_id),
2435         false);
2436   }
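
  // Sketch of the two-step CTC flow (prepare, then compute). The descriptors,
  // label/length spans, allocator and DeviceMemory<float> buffers named below
  // are assumed to be provided by the caller:
  //
  //   DeviceMemory<uint8> scratch;
  //   int algo_id = -1;
  //   port::Status prep = dnn->PrepareForCtcLoss(
  //       stream, *probs_desc, probs_data, *grads_desc, labels, label_lengths,
  //       input_lengths, workspace_allocator, &scratch, &algo_id);
  //   if (prep.ok()) {
  //     dnn->DoCtcLoss(stream, *probs_desc, probs_data, labels, label_lengths,
  //                    input_lengths, &costs_data, *grads_desc, &grads_data,
  //                    &scratch, algo_id);
  //   }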
2437 
2438   // Transforms a tensor into another tensor with a different layout and/or data
2439   // type.
2440   //
2441   // Arguments:
2442   //  stream: pointer to the stream where this operation should be enqueued to.
2443   //  input_desc: specifies the shape and the data layout of the input tensor.
2444   //  input_type: the data type of the input tensor.
2445   //  input_data: the device memory region that contains the input tensor.
2446   //  output_desc: specifies the shape and the data layout of the output tensor.
2447   //  output_type: the data type of the output tensor.
2448   //  scale: an element-wise scaling factor to apply.
2449   //  output_data: the device memory region that contains the output tensor.
2450   virtual bool DoTransformTensor(Stream* stream,
2451                                  const dnn::BatchDescriptor& input_desc,
2452                                  dnn::DataType input_type,
2453                                  const DeviceMemoryBase& input_data,
2454                                  const dnn::BatchDescriptor& output_desc,
2455                                  dnn::DataType output_type, float scale,
2456                                  DeviceMemoryBase* output_data) {
2457     return false;
2458   }
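
  // Sketch: converting an NHWC float tensor to an NCHW half tensor with unit
  // scale. The extents are illustrative and both buffers are assumed to hold
  // the same number of elements:
  //
  //   dnn::BatchDescriptor in_desc;
  //   in_desc.set_count(32).set_height(224).set_width(224)
  //       .set_feature_map_count(3)
  //       .set_layout(dnn::DataLayout::kBatchYXDepth);
  //   dnn::BatchDescriptor out_desc = in_desc;
  //   out_desc.set_layout(dnn::DataLayout::kBatchDepthYX);
  //   bool ok = dnn->DoTransformTensor(stream, in_desc, dnn::DataType::kFloat,
  //                                    input, out_desc, dnn::DataType::kHalf,
  //                                    /*scale=*/1.0f, &output);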
2459 
2460   // Enqueues a fused convolution+bias+activation operation onto the stream.
2461   //
2462   // Arguments (all borrowed):
2463   //
2464   //  stream: borrowed pointer to the stream that the 'fusion' operation should
2465   //  be enqueued onto.
2466   //
2467   //  conv_input_descriptor: dimensions of the convolution input layer.
2468   //  conv_input_data: device memory which contains the convolution input.
2469   //
2470   //  filter_descriptor: dimensions of the convolution filter.
2471   //  filter_data: device memory which contains the convolution filter weights.
2472   //
2473   //  convolution_descriptor: stride of the convolution filter.
2474   //
2475   //  bias_descriptor: dimensions of the bias layer.
2476   //  bias_data: device memory region containing biases to add to the
2477   //    convolution output.
2478   //
2479   //  activation_mode: Type of activation to perform.
2480   //
2481   //  output_descriptor: dimensions of the output layer.
2482   //  output_data: device memory region in which to place the fusion result.
2483   //
2484   //  output_profile_result: the output profile result for this call.
2485   //         The profiling is only enabled when this is not nullptr.
2486   //
2487   virtual bool DoFusedConvolutionBiasActivation(
2488       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
2489       const DeviceMemory<float>& conv_input_data,
2490       const dnn::FilterDescriptor& filter_descriptor,
2491       const DeviceMemory<float>& filter_data,
2492       const dnn::ConvolutionDescriptor& convolution_descriptor,
2493       const dnn::BatchDescriptor& bias_descriptor,
2494       const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
2495       const dnn::BatchDescriptor& output_descriptor,
2496       DeviceMemory<float>* output_data,
2497       dnn::ProfileResult* output_profile_result) {
2498     return false;
2499   }
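
  // Sketch of a fused 3x3, stride-1, same-padded convolution with bias and
  // ReLU. The descriptors and DeviceMemory<float> buffers other than
  // 'conv_desc' are assumed to be prepared by the caller:
  //
  //   dnn::ConvolutionDescriptor conv_desc;
  //   conv_desc.set_vertical_filter_stride(1)
  //       .set_horizontal_filter_stride(1)
  //       .set_zero_padding_height(1)
  //       .set_zero_padding_width(1);
  //   bool ok = dnn->DoFusedConvolutionBiasActivation(
  //       stream, conv_input_desc, conv_input, filter_desc, filter, conv_desc,
  //       bias_desc, bias, dnn::ActivationMode::kRelu, output_desc, &output,
  //       /*output_profile_result=*/nullptr);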
2500 
2501   // Enqueues a fused batchnorm+activation (inference) operation onto the
2502   // stream.
2503   //
2504   // Arguments (all borrowed):
2505   //
2506   //  stream: borrowed pointer to the stream that the 'fusion' operation should
2507   //  be enqueued onto.
2508   //
2509   //  x_descriptor: dimensions of the batchnorm input layer.
2510   //  x_data: device memory which contains the batchnorm input.
2511   //
2512   //  scale_offset_mean_variance_descriptor:
2513   //      dimensions of the scale/offset/mean/variance tensor.
2514   //  scale_data: device memory which contains the scale input.
2515   //  offset_data: device memory which contains the offset input.
2516   //  mean_data: device memory which contains the mean input.
2517   //  variance_data: device memory which contains the variance input.
2518   //  epsilon: the epsilon value to use in the batchnorm calculation.
2519   //
2520   //  activation_mode: Type of activation to perform.
2521   //
2522   //  y_data: device memory region in which to place the fusion result.
2523   //
2524   //  output_profile_result: the output profile result for this call.
2525   //         The profiling is only enabled when this is not nullptr.
2526   //
2527   virtual bool DoFusedBatchNormActivationInference(
2528       Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2529       const DeviceMemory<float>& x_data,
2530       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2531       const DeviceMemory<float>& scale_data,
2532       const DeviceMemory<float>& offset_data,
2533       const DeviceMemory<float>& mean_data,
2534       const DeviceMemory<float>& variance_data, double epsilon,
2535       dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
2536       dnn::ProfileResult* output_profile_result) {
2537     return false;
2538   }
2539 
2540   virtual bool DoFusedBatchNormActivationInference(
2541       Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2542       const DeviceMemory<Eigen::half>& x_data,
2543       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2544       const DeviceMemory<float>& scale_data,
2545       const DeviceMemory<float>& offset_data,
2546       const DeviceMemory<float>& mean_data,
2547       const DeviceMemory<float>& variance_data, double epsilon,
2548       dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
2549       dnn::ProfileResult* output_profile_result) {
2550     return false;
2551   }
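
  // Sketch (inference): batchnorm with precomputed population statistics
  // followed by ReLU. All tensors are assumed to be allocated by the caller;
  // scale, offset, mean and variance share one descriptor:
  //
  //   bool ok = dnn->DoFusedBatchNormActivationInference(
  //       stream, x_desc, x, scale_offset_desc, scale, offset, mean, variance,
  //       /*epsilon=*/1e-3, dnn::ActivationMode::kRelu, &y,
  //       /*output_profile_result=*/nullptr);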
2552 
2553   // Enqueues a fused batchnorm+activation (training-fwd) operation onto the
2554   // stream.
2555   //
2556   // Arguments (all borrowed):
2557   //
2558   //  stream: borrowed pointer to the stream that the 'fusion' operation should
2559   //  be enqueued onto.
2560   //
2561   //  x_descriptor: dimensions of the batchnorm input layer.
2562   //  x_data: device memory which contains the batchnorm input.
2563   //
2564   //  scale_offset_mean_variance_descriptor:
2565   //      dimensions of the scale/offset/mean/variance tensor.
2566   //  scale_data: device memory which contains the scale input.
2567   //  offset_data: device memory which contains the offset input.
2568   //  epsilon: the epsilon value to use in the batchnorm calculation.
2569   //
2570   //  activation_mode: Type of activation to perform.
2571   //
2572   //  y_data: device memory region in which to place the fusion result.
2573   //  batch_mean_data: device memory in which to place the batch mean output.
2574   //  batch_var_data: device memory in which to place the batch variance output.
2575   //  saved_mean_data: device memory in which to save the mean for bwd pass.
2576   //  saved_var_data: device memory in which to save the variance for bwd pass.
2577   //
2578   //  output_profile_result: the output profile result for this call.
2579   //         The profiling is only enabled when this is not nullptr.
2580   //
2581   virtual bool DoFusedBatchNormActivationForward(
2582       Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2583       const DeviceMemory<float>& x_data,
2584       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2585       const DeviceMemory<float>& scale_data,
2586       const DeviceMemory<float>& offset_data, double epsilon,
2587       dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
2588       DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2589       DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2590       dnn::ProfileResult* output_profile_result) {
2591     return false;
2592   }
2593 
2594   virtual bool DoFusedBatchNormActivationForward(
2595       Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2596       const DeviceMemory<Eigen::half>& x_data,
2597       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2598       const DeviceMemory<float>& scale_data,
2599       const DeviceMemory<float>& offset_data, double epsilon,
2600       dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
2601       DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2602       DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2603       dnn::ProfileResult* output_profile_result) {
2604     return false;
2605   }
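
  // Sketch (training): the forward fusion also emits the batch statistics and
  // the saved mean/variance that the backward fusion below consumes. All
  // buffers are assumed to be allocated by the caller:
  //
  //   bool ok = dnn->DoFusedBatchNormActivationForward(
  //       stream, x_desc, x, scale_offset_desc, scale, offset,
  //       /*epsilon=*/1e-3, dnn::ActivationMode::kRelu, &y, &batch_mean,
  //       &batch_var, &saved_mean, &saved_var,
  //       /*output_profile_result=*/nullptr);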
2606 
2607   // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
2608   // stream.
2609   //
2610   // Arguments (all borrowed):
2611   //
2612   //  stream: borrowed pointer to the stream that the 'fusion' operation should
2613   //  be enqueued onto.
2614   //
2615   //  y_act_backprop_descriptor: dimensions of the backprop input from the
2616   //    previous layer.
2617   //  y_act_backprop_data: device memory which contains the backprop input.
2618   //
2619   //  y_act_data: device memory which contains the actv-fwd output data.
2620   //
2621   //  activation_mode: actv-fwd type.
2622   //
2623   //  scale_offset_mean_variance_descriptor:
2624   //      dimensions of the scale/offset/mean/variance tensor.
2625   //  scale_data: device memory which contains the scale input.
2626   //  offset_data: device memory which contains the offset input.
2627   //  saved_mean_data: device memory which contains the saved mean from fwd pass.
2628   //  saved_var_data: device memory which contains the saved variance from fwd
2629   //    pass.
2630   //
2631   //  x_bn_backprop_data: device memory region in which to place the backprop
2632   //    data from this layer.
2633   //  scale_backprop_data: device memory in which to place the scale backprop output.
2634   //  offset_backprop_data: device memory in which to place the offset backprop output.
2635   //
2636   //  output_profile_result: the output profile result for this call.
2637   //         The profiling is only enabled when this is not nullptr.
2638   //
2639   virtual bool DoFusedBatchNormActivationBackward(
2640       Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
2641       const DeviceMemory<float>& y_act_backprop_data,
2642       const DeviceMemory<float>& y_act_data,
2643       dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
2644       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2645       const DeviceMemory<float>& scale_data,
2646       const DeviceMemory<float>& offset_data,
2647       const DeviceMemory<float>& saved_mean_data,
2648       const DeviceMemory<float>& saved_var_data,
2649       DeviceMemory<float>* x_bn_backprop_data,
2650       DeviceMemory<float>* scale_backprop_data,
2651       DeviceMemory<float>* offset_backprop_data,
2652       dnn::ProfileResult* output_profile_result) {
2653     return false;
2654   }
2655 
2656   virtual bool DoFusedBatchNormActivationBackward(
2657       Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
2658       const DeviceMemory<Eigen::half>& y_act_backprop_data,
2659       const DeviceMemory<Eigen::half>& y_act_data,
2660       dnn::ActivationMode activation_mode,
2661       const DeviceMemory<Eigen::half>& x_bn_data,
2662       const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2663       const DeviceMemory<float>& scale_data,
2664       const DeviceMemory<float>& offset_data,
2665       const DeviceMemory<float>& saved_mean_data,
2666       const DeviceMemory<float>& saved_var_data,
2667       DeviceMemory<Eigen::half>* x_bn_backprop_data,
2668       DeviceMemory<float>* scale_backprop_data,
2669       DeviceMemory<float>* offset_backprop_data,
2670       dnn::ProfileResult* output_profile_result) {
2671     return false;
2672   }
2673 
2674  protected:
2675   // Returns whether status is 'ok', and potentially logs the error.
2676   static bool IsStatusOk(const port::Status& status, bool report_error);
2677 
2678  private:
2679   virtual port::Status DoPrepareForConvolution(
2680       ConvolutionKind kind, DataType element_type, Stream* stream,
2681       const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data,
2682       const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
2683       const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
2684       const ConvolutionDescriptor& convolution_descriptor,
2685       const AlgorithmConfig& algorithm_config,
2686       ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
2687       DeviceMemory<uint8>* scratch_memory) {
2688     *algorithm_desc = {};
2689     *scratch_memory = {};
2690     return port::Status::OK();
2691   }
2692 
2693   virtual port::Status DoPrepareForCtcLoss(
2694       Stream* stream, DataType element_type,
2695       const RnnStateTensorDescriptor& probs_desc,
2696       const RnnStateTensorDescriptor& grads_desc,
2697       absl::Span<const int> labels_data,
2698       absl::Span<const int> labels_lengths_data,
2699       absl::Span<const int> input_lengths_data,
2700       ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch_memory,
2701       int* ctc_loss_algo_id) {
2702     *scratch_memory = {};
2703     return port::Status::OK();
2704   }
2705 
2706   SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
2707 };
2708 
2709 }  // namespace dnn
2710 }  // namespace stream_executor
2711 
2712 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
2713