1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Make this file empty (or nearly empty) so that it can be compiled even when
17 // libxsmm is not available.
18 
19 #ifndef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
20 void dummy_xsmm_conv2d_ensure_file_is_not_empty();
21 #else
22 
23 #define USE_EIGEN_TENSOR
24 #define EIGEN_USE_THREADS
25 
26 #include "tensorflow/core/kernels/xsmm_conv2d.h"
27 
28 #include <stdlib.h>
29 #include <cstring>
30 
31 #include "tensorflow/core/framework/op_kernel.h"
32 #include "tensorflow/core/lib/core/blocking_counter.h"
33 #include "tensorflow/core/lib/core/threadpool.h"
34 
35 #include "include/libxsmm_cpuid.h"
36 #include "include/libxsmm_malloc.h"
37 #include "third_party/libxsmm/src/libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
38 
39 namespace tensorflow {
40 
41 // Xsmm*Conv2D are wrappers for libxsmm direct convolutions.
42 
43 // Returns true if convolution can be computed efficiently by XsmmConv2D,
44 // returns false otherwise.
CanUseXsmmConv2D(const libxsmm_dnn_conv_desc & desc,TensorFormat data_format)45 bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
46                       TensorFormat data_format) {
47   int VECTOR_SIZE;
48   int arch = libxsmm_cpuid_x86();
49 
50   if (arch == LIBXSMM_X86_AVX512_CORE) {
51     VECTOR_SIZE = 16;
52   } else if (arch == LIBXSMM_X86_AVX2) {
53     VECTOR_SIZE = 8;
54   } else {
55     VLOG(1) << "Cannot use XSMM convolutions: unsupported architecture!";
56     return false;
57   }
58 
59   if (data_format != FORMAT_NHWC) {
60     VLOG(1) << "Cannot use XSMM convolutions: unsupported format!";
61     return false;
62   }
63   if (desc.K % VECTOR_SIZE != 0) {
64     VLOG(1) << "Cannot use XSMM convolutions: output features count not"
65                " divisible by vector size!";
66     return false;
67   }
68   VLOG(2) << "Can use XSMM convolutions.";
69   return true;
70 }
71 
72 typedef Eigen::ThreadPoolDevice CPUDevice;
73 
74 namespace functor {
75 
chk_libxsmm_err(libxsmm_dnn_err_t status,string msg)76 static void chk_libxsmm_err(libxsmm_dnn_err_t status, string msg) {
77   if (status != LIBXSMM_DNN_SUCCESS) {
78     VLOG(0) << msg << " failed: " << libxsmm_dnn_get_error(status);
79   }
80 }
81 
copy_RSCK_to_custom(const float * rsck,float * kcrs,int R,int S,int C,int K,int blocksifm,int blocksofm,int ifmblock,int ofmblock,int start,int end)82 LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float* kcrs, int R,
83                                         int S, int C, int K, int blocksifm,
84                                         int blocksofm, int ifmblock,
85                                         int ofmblock, int start, int end) {
86   LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C, K);
87   LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm, R, S, ifmblock, ofmblock);
88   int r, s, k, c, v1, v2;
89 
90   for (k = start; k < end; k++) {
91     for (c = 0; c < blocksifm; c++) {
92       for (r = 0; r < R; r++) {
93         for (s = 0; s < S; s++) {
94           for (v1 = c * ifmblock; v1 < std::min(C, (c + 1) * ifmblock); v1++) {
95             for (v2 = k * ofmblock; v2 < std::min(K, (k + 1) * ofmblock); v2++)
96               LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
97                                  v2 - k * ofmblock, blocksifm, R, S, ifmblock,
98                                  ofmblock) =
99                   LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
100             for (v2 = K; v2 < (k + 1) * ofmblock; v2++)
101               LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
102                                  v2 - k * ofmblock, blocksifm, R, S, ifmblock,
103                                  ofmblock) = 0.0f;
104           }
105           for (v1 = C; v1 < (c + 1) * ifmblock; v1++) {
106             for (v2 = k * ofmblock; v2 < (k + 1) * ofmblock; v2++)
107               LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
108                                  v2 - k * ofmblock, blocksifm, R, S, ifmblock,
109                                  ofmblock) = 0.0f;
110           }
111         }
112       }
113     }
114   }
115 }
116 
117 class libxsmm_dnn_conv_desc_wrap {
118  public:
119   const libxsmm_dnn_conv_desc d;
120 
libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc & d_)121   libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc& d_) : d(d_) {}
operator ==(const libxsmm_dnn_conv_desc_wrap & w) const122   bool operator==(const libxsmm_dnn_conv_desc_wrap& w) const {
123     return (d.N == w.d.N && d.C == w.d.C && d.H == w.d.H && d.W == w.d.W &&
124             d.K == w.d.K && d.R == w.d.R && d.S == w.d.S && d.u == w.d.u &&
125             d.v == w.d.v && d.pad_h == w.d.pad_h && d.pad_w == w.d.pad_w);
126   }
127 };
128 
129 struct HashFunction {
operator ()tensorflow::functor::HashFunction130   std::size_t operator()(const libxsmm_dnn_conv_desc_wrap& w) const {
131     return libxsmm_hash(&w.d, sizeof(w.d), 25071975);
132   }
133 };
134 
135 class handles {
136  public:
find(const libxsmm_dnn_conv_desc_wrap & w)137   libxsmm_dnn_layer* find(const libxsmm_dnn_conv_desc_wrap& w) {
138     std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
139                        HashFunction>::iterator i = libxsmm_handles.find(w);
140     if (i == libxsmm_handles.end()) {
141       libxsmm_dnn_err_t status;
142       libxsmm_dnn_layer* libxsmm_handle =
143           libxsmm_dnn_create_conv_layer(w.d, &status);
144       chk_libxsmm_err(status, "Create handle");
145       libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
146       return libxsmm_handle;
147     } else {
148       return i->second;
149     }
150   }
~handles()151   ~handles() {
152     std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
153                        HashFunction>::iterator i;
154     for (i = libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
155       chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
156                       "Destroy handle");
157   }
158 
159  private:
160   std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
161                      HashFunction>
162       libxsmm_handles;
163 };
164 
165 static handles libxsmm_handles;
166 
167 // #define LIBXSMM_DETAILED_TIMING
168 
169 template <typename InputPtr, typename FilterPtr, typename OutputPtr>
CallLibxsmmConvGeneric(OpKernelContext * ctx,const libxsmm_dnn_conv_desc & desc,libxsmm_dnn_compute_kind kind,InputPtr input,FilterPtr filter,OutputPtr output)170 static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
171                                    const libxsmm_dnn_conv_desc& desc,
172                                    libxsmm_dnn_compute_kind kind,
173                                    InputPtr input, FilterPtr filter,
174                                    OutputPtr output) {
175 #if defined(LIBXSMM_DETAILED_TIMING)
176   uint64 l_tick1;
177   uint64 l_tick2;
178   uint64 l_tick3;
179   uint64 l_tick4;
180   uint64 l_tick5;
181   uint64 l_tick6;
182   uint64 l_tick7;
183   uint64 l_tick8;
184   uint64 l_tick9;
185   uint64 l_tick10;
186   l_tick1 = libxsmm_timer_tick();
187 #endif
188   // setup scoped allocator, which adopts the allocator from the context
189   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
190   libxsmm_dnn_err_t status;
191   libxsmm_dnn_layer* libxsmm_handle;
192   libxsmm_dnn_conv_desc_wrap w(desc);
193   void* scratch;
194 
195   // if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
196   libxsmm_handle = libxsmm_handles.find(w);
197   // else{
198   //  libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
199   //  chk_libxsmm_err(status, "Create handle");
200   //}
201 
202   status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
203   if (status == LIBXSMM_DNN_WARN_FALLBACK) {
204     return false;  // Use non-libxsmm code
205   }
206   chk_libxsmm_err(status, "Check codegen status");
207 
208   libxsmm_dnn_buffer* libxsmm_input;
209   libxsmm_dnn_buffer* libxsmm_output;
210   libxsmm_dnn_filter* libxsmm_filter;
211 
212 #if defined(LIBXSMM_DETAILED_TIMING)
213   l_tick2 = libxsmm_timer_tick();
214 #endif
215 
216   int ifmblock = (libxsmm_handle->ifmblock);
217   int ofmblock = (libxsmm_handle->ofmblock);
218 
219   int blocksifm =
220       desc.C % ifmblock == 0 ? desc.C / ifmblock : desc.C / ifmblock + 1;
221   int blocksofm =
222       desc.K % ofmblock == 0 ? desc.K / ofmblock : desc.K / ofmblock + 1;
223   float* native_filter =
224       (float*)libxsmm_aligned_scratch(blocksofm * blocksifm * desc.R * desc.S *
225                                           ifmblock * ofmblock * sizeof(float),
226                                       2097152);
227 
228   const DeviceBase::CpuWorkerThreads* worker_threads =
229       ctx->device()->tensorflow_cpu_worker_threads();
230 
231   int num_threads = worker_threads->num_threads;
232 
233 #if 1
234   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
235       kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
236     if (blocksofm > num_threads) {
237       int work = blocksofm;
238       BlockingCounter count(num_threads);
239       for (int i = 0; i < num_threads; ++i) {
240         worker_threads->workers->Schedule([=, &count]() {
241           int start = work / num_threads * i;
242           int end = (start + work / num_threads) > work
243                         ? work
244                         : start + work / num_threads;
245           copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
246                               desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
247                               start, end);
248           count.DecrementCount();
249         });
250       }
251       count.Wait();
252     } else {
253       int work = blocksofm;
254       int num_threads = work;
255 
256       BlockingCounter count(num_threads);
257       for (int i = 0; i < num_threads; ++i) {
258         worker_threads->workers->Schedule([=, &count]() {
259           int start = i;
260           int end = i + 1;
261           copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
262                               desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
263                               start, end);
264           count.DecrementCount();
265         });
266       }
267       count.Wait();
268     }
269   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
270     // Added: for weight update
271     libxsmm_filter =
272         libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter,
273                                 LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
274     chk_libxsmm_err(status,
275                     "Link filter");  // weight update is in RSCK as
276                                      // filter should be returned in RSCK
277                                      // format
278   }
279 #else
280   memset(native_filter, 0,
281          blocksofm * blocksifm * desc.R * desc.S * ifmblock * ofmblock *
282              sizeof(float));
283 #endif
284 
285 #if defined(LIBXSMM_DETAILED_TIMING)
286   l_tick3 = libxsmm_timer_tick();
287 #endif
288 
289   libxsmm_input =
290       libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_INPUT, input,
291                               LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
292   chk_libxsmm_err(status, "Link input buffer");
293   libxsmm_output =
294       libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_OUTPUT, output,
295                               LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
296   chk_libxsmm_err(status, "Link output buffer");
297   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
298       kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
299     libxsmm_filter = libxsmm_dnn_link_filter(
300         libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter,
301         LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
302     chk_libxsmm_err(status, "Link filter");
303   }
304   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
305     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
306                                             LIBXSMM_DNN_REGULAR_INPUT),
307                     "Bind input forward");
308     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
309                                             LIBXSMM_DNN_REGULAR_OUTPUT),
310                     "Bind output forward");
311     chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
312                                             LIBXSMM_DNN_REGULAR_FILTER),
313                     "Bind filter forward");
314   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
315     chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_input), "Zero input");
316 
317     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
318                                             LIBXSMM_DNN_GRADIENT_INPUT),
319                     "Bind input backward");
320     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
321                                             LIBXSMM_DNN_GRADIENT_OUTPUT),
322                     "Bind output backward");
323     chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
324                                             LIBXSMM_DNN_REGULAR_FILTER),
325                     "Bind filter backward");
326   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
327     chk_libxsmm_err(libxsmm_dnn_zero_filter(libxsmm_filter), "Zero filter");
328 
329     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
330                                             LIBXSMM_DNN_REGULAR_INPUT),
331                     "Bind input weight update");
332     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
333                                             LIBXSMM_DNN_GRADIENT_OUTPUT),
334                     "Bind output weight update");
335     chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
336                                             LIBXSMM_DNN_GRADIENT_FILTER),
337                     "Bind filter weight update");
338   } else {
339     /* shouldn't happen */
340   }
341 
342 #if defined(LIBXSMM_DETAILED_TIMING)
343   l_tick4 = libxsmm_timer_tick();
344 #endif
345 
346   /* bind scratch */
347   scratch = (void*)libxsmm_aligned_scratch(
348       libxsmm_dnn_get_scratch_size(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL,
349                                    &status),
350       2097152);
351   chk_libxsmm_err(status, "scratch allocation");
352   chk_libxsmm_err(libxsmm_dnn_bind_scratch(
353                       libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch),
354                   "binding scratch");
355 
356 #if defined(LIBXSMM_DETAILED_TIMING)
357   l_tick5 = libxsmm_timer_tick();
358 #endif
359 
360   if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
361     libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
362   }
363 
364 #if defined(LIBXSMM_DETAILED_TIMING)
365   l_tick6 = libxsmm_timer_tick();
366 #endif
367 
368   BlockingCounter counter(num_threads);
369 
370   for (int i = 0; i < num_threads; ++i) {
371     worker_threads->workers->Schedule([=, &counter]() {
372       chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
373                       "Worker");
374       counter.DecrementCount();
375     });
376   }
377   counter.Wait();
378 
379 #if defined(LIBXSMM_DETAILED_TIMING)
380   l_tick7 = libxsmm_timer_tick();
381 #endif
382 
383   if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
384     libxsmm_dnn_reduce_wu_filters(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER);
385   }
386 
387 #if defined(LIBXSMM_DETAILED_TIMING)
388   l_tick8 = libxsmm_timer_tick();
389 #endif
390 
391   /* clean up */
392   chk_libxsmm_err(
393       libxsmm_dnn_release_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL),
394       "release scratch");
395   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
396     chk_libxsmm_err(
397         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
398         "release input");
399     chk_libxsmm_err(
400         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT),
401         "release output");
402     chk_libxsmm_err(
403         libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
404         "release filter");
405   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
406     chk_libxsmm_err(
407         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT),
408         "release input");
409     chk_libxsmm_err(
410         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
411         "release output");
412     chk_libxsmm_err(
413         libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
414         "release filter");
415   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
416     chk_libxsmm_err(
417         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
418         "release input");
419     chk_libxsmm_err(
420         libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
421         "release output");
422     chk_libxsmm_err(
423         libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER),
424         "release filter");
425   } else {
426     /* shouldn't happen */
427   }
428   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
429   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
430   chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
431 
432 #if defined(LIBXSMM_DETAILED_TIMING)
433   l_tick9 = libxsmm_timer_tick();
434 #endif
435 
436   // if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
437   // chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
438   //               "Destroy handle");
439 
440   libxsmm_free(native_filter);
441   libxsmm_free(scratch);
442 
443 #if defined(LIBXSMM_DETAILED_TIMING)
444   l_tick10 = libxsmm_timer_tick();
445   printf(
446       "time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, "
447       "%f, %f, %f\n",
448       desc.N, desc.C, desc.K, desc.R, desc.S,
449       libxsmm_timer_duration(l_tick1, l_tick2),
450       libxsmm_timer_duration(l_tick2, l_tick3),
451       libxsmm_timer_duration(l_tick3, l_tick4),
452       libxsmm_timer_duration(l_tick4, l_tick5),
453       libxsmm_timer_duration(l_tick5, l_tick6),
454       libxsmm_timer_duration(l_tick6, l_tick7),
455       libxsmm_timer_duration(l_tick7, l_tick8),
456       libxsmm_timer_duration(l_tick8, l_tick9),
457       libxsmm_timer_duration(l_tick9, l_tick10),
458       libxsmm_timer_duration(l_tick1, l_tick10));
459 #endif
460 
461   return true;  // Succeeded
462 }
463 
464 #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
465 template <typename T>
466 struct XsmmFwdConv2D<CPUDevice, T> {
operator ()tensorflow::functor::XsmmFwdConv2D467   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
468                   const T* input, const T* filter, T* output) {
469     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD,
470                                   input, filter, output);
471   }
472 };
473 #endif
474 
475 #ifdef TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
476 template <typename T>
477 struct XsmmBkwInputConv2D<CPUDevice, T> {
operator ()tensorflow::functor::XsmmBkwInputConv2D478   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
479                   T* input, const T* filter, const T* output) {
480     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD,
481                                   input, filter, output);
482   }
483 };
484 
485 template <typename T>
486 struct XsmmBkwFilterConv2D<CPUDevice, T> {
operator ()tensorflow::functor::XsmmBkwFilterConv2D487   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
488                   const T* input, T* filter, const T* output) {
489     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD,
490                                   input, filter, output);
491   }
492 };
493 #endif
494 
495 }  // namespace functor
496 
// Explicit instantiations of the CPU functors for float.
// NOTE(review): the two backward specializations in this file are only defined
// when TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS is set, whereas these
// instantiations are unconditional — confirm that this is intended.
template struct functor::XsmmFwdConv2D<CPUDevice, float>;
template struct functor::XsmmBkwInputConv2D<CPUDevice, float>;
template struct functor::XsmmBkwFilterConv2D<CPUDevice, float>;
500 
501 }  // namespace tensorflow
502 
503 #endif  // TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
504