1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "include/libxsmm.h"
17 #include "tensorflow/core/framework/fake_input.h"
18 #include "tensorflow/core/graph/graph.h"
19 #include "tensorflow/core/graph/node_builder.h"
20 #include "tensorflow/core/kernels/conv_ops.h"
21 #include "tensorflow/core/kernels/ops_testutil.h"
22 #include "tensorflow/core/platform/test.h"
23 
24 namespace tensorflow {
25 namespace {
26 
// Geometry of one 2-D convolution as consumed by the naive reference
// implementation naive_conv_fp(). Extents ending in "p" describe the buffer
// including its physical padding (e.g. ifhp = ifh + 2 * pad_h_in).
//
// Every member defaults to 0 so that a field a caller forgets to set reads as
// a well-defined value instead of indeterminate memory.
struct naive_conv_t {
  int nImg = 0;       // mini-batch size ("N")
  int nIfm = 0;       // number of input feature maps ("C")
  int nOfm = 0;       // number of output feature maps ("K")
  int ifhp = 0;       // input height including physical padding
  int ifwp = 0;       // input width including physical padding
  int ifh = 0;        // input height ("H")
  int ifw = 0;        // input width ("W")
  int ofhp = 0;       // output height including physical padding
  int ofwp = 0;       // output width including physical padding
  int ofh = 0;        // output height
  int ofw = 0;        // output width
  int pad_h = 0;      // logical padding subtracted from the input row index
  int pad_w = 0;      // logical padding subtracted from the input column index
  int pad_h_in = 0;   // physical padding rows on each side of the input
  int pad_w_in = 0;   // physical padding columns on each side of the input
  int pad_h_out = 0;  // physical padding rows on each side of the output
  int pad_w_out = 0;  // physical padding columns on each side of the output
  int kh = 0;         // filter height ("R")
  int kw = 0;         // filter width ("S")
  int stride_h = 0;   // vertical stride when stepping through the input
  int stride_w = 0;   // horizontal stride when stepping through the input
};
50 
naive_copy_NCHW_to_NHWC(const float * nchw,Tensor & nhwc,int N,int H,int W,int C)51 LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc,
52                                             int N, int H, int W, int C) {
53   LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W);
54   int n, h, w, c;
55   auto output = nhwc.flat<float>();
56   for (n = 0; n < N; n++) {
57     for (h = 0; h < H; h++) {
58       for (w = 0; w < W; w++) {
59         for (c = 0; c < C; c++) {
60           output(n * H * W * C + h * W * C + w * C + c) =
61               LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W);
62         }
63       }
64     }
65   }
66 }
67 
naive_copy_KCRS_to_RSCK(const float * kcrs,Tensor & rsck,int R,int S,int C,int K)68 LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck,
69                                             int R, int S, int C, int K) {
70   LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S);
71   int r, s, c, k;
72   auto output = rsck.flat<float>();
73 
74   for (r = 0; r < R; r++) {
75     for (s = 0; s < S; s++) {
76       for (c = 0; c < C; c++) {
77         for (k = 0; k < K; k++) {
78           output(r * S * C * K + s * C * K + c * K + k) =
79               LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S);
80         }
81       }
82     }
83   }
84 }
85 
zero_buf(float * buf,long size)86 LIBXSMM_INLINE void zero_buf(float* buf, long size) {
87   int i;
88   for (i = 0; i < size; ++i) {
89     buf[i] = 0.0f;
90   }
91 }
92 
copy_buf(Tensor & dst,float * src,long size)93 LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) {
94   long i;
95   auto output = dst.flat<float>();
96   for (i = 0; i < size; ++i) output(i) = src[i];
97 }
98 
init_buf(float * buf,long size,int initPos,int initOne)99 LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) {
100   int i;
101   zero_buf(buf, size);
102   for (i = 0; i < size; ++i) {
103     buf[i] =
104         (float)((initOne != 0)
105                     ? 1.0
106                     : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0)));
107   }
108 }
109 
naive_conv_fp(naive_conv_t * param,const float * input,float * output,const float * filter)110 LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input,
111                                   float* output, const float* filter) {
112   int nImg = param->nImg;
113   int nIfm = param->nIfm;
114   int nOfm = param->nOfm;
115   int ifhp = param->ifhp;
116   int ifwp = param->ifwp;
117   int ofhp = param->ofhp;
118   int ofwp = param->ofwp;
119   int ifh = param->ifh;
120   int ifw = param->ifw;
121   int ofh = param->ofh;
122   int ofw = param->ofw;
123   int pad_h = param->pad_h;
124   int pad_w = param->pad_w;
125   int pad_h_in = param->pad_h_in;
126   int pad_w_in = param->pad_w_in;
127   int pad_h_out = param->pad_h_out;
128   int pad_w_out = param->pad_w_out;
129   int kh = param->kh;
130   int kw = param->kw;
131   int stride_h = param->stride_h;
132   int stride_w = param->stride_w;
133   /* loop counters */
134   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
135 
136   LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out),
137                    nOfm, ofhp, ofwp);
138   LIBXSMM_VLA_DECL(4, const float, input_t,
139                    input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
140   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
141 
142   for (img = 0; img < nImg; ++img) {
143     for (ofm = 0; ofm < nOfm; ++ofm) {
144       for (ifm = 0; ifm < nIfm; ++ifm) {
145         for (oj = 0; oj < ofh; ++oj) {
146           ij = oj * stride_h - pad_h;
147           for (oi = 0; oi < ofw; ++oi) {
148             ii = oi * stride_w - pad_w;
149             for (kj = 0; kj < kh; ++kj) {
150               if (ij + kj < 0 || ij + kj >= ifh) continue;
151               for (ki = 0; ki < kw; ++ki) {
152                 if (ii + ki < 0 || ii + ki >= ifw) continue;
153                 LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp,
154                                    ofwp) +=
155                     LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki,
156                                        nIfm, ifhp, ifwp) *
157                     LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh,
158                                        kw);
159               }
160             }
161           }
162         }
163       }
164     }
165   }
166 }
167 
// Intentionally empty placeholder; nothing in the visible part of this file
// calls it.
void RunXsmmVsGeneric() {
  return;
}
169 
170 class XsmmConv2DTest : public OpsTestBase {
171  protected:
MakeOp(int stride)172   void MakeOp(int stride) {
173     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
174                     .Input(FakeInput(DT_FLOAT))
175                     .Input(FakeInput(DT_FLOAT))
176                     .Attr("strides", {1, stride, stride, 1})
177                     .Attr("padding", "VALID")
178                     .Finalize(node_def()));
179 
180     TF_ASSERT_OK(InitOp());
181   }
182 };
183 
TEST_F(XsmmConv2DTest,Basic)184 TEST_F(XsmmConv2DTest, Basic) {
185   MakeOp(1);
186 
187   // setup scoped allocator, which uses cpu_allocator() for this scope
188   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
189 
190   int ifw = 14;   /* input width, "W" */
191   int ifh = 14;   /* input height, "H" */
192   int nImg = 32;  /* mini-batch size, "N" */
193   int nIfm = 64;  /* number of input feature maps, "C" */
194   int nOfm = 64;  /* number of output feature maps, "K" */
195   int kh = 3;     /* filter height, "R" */
196   int kw = 3;     /* filter width, "S" */
197   int pad = 0;    /* padding in output */
198   int stride = 1; /* stride when accessing inputs */
199 
200   int stride_w = stride;
201   int stride_h = stride;
202   int pad_h = pad;
203   int pad_w = pad;
204 
205   int pad_h_in = pad_h;
206   int pad_w_in = pad_w;
207 
208   int pad_h_out = 0;
209   int pad_w_out = 0;
210 
211   /* deriving some values for naive code */
212   int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
213   int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
214   int ifhp = ifh + 2 * pad_h_in;
215   int ifwp = ifw + 2 * pad_w_in;
216   int ofhp = ofh + 2 * pad_h_out;
217   int ofwp = ofw + 2 * pad_w_out;
218 
219   // Initialization of Filter and Image
220 
221   /* allocate data */
222   float* naive_input = (float*)libxsmm_aligned_scratch(
223       nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152);
224   float* naive_output = (float*)libxsmm_aligned_scratch(
225       nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152);
226   float* naive_filter = (float*)libxsmm_aligned_scratch(
227       nOfm * nIfm * kh * kw * sizeof(float), 2097152);
228   /* initialize data */
229   init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0);
230   zero_buf(naive_output, nImg * nOfm * ofhp * ofwp);
231   init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0);
232 
233   Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm});
234 
235   Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm});
236 
237   naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
238   naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
239 
240   // Run naive convolution
241 
242   naive_conv_t naive_param;
243 
244   naive_param.nImg = nImg;
245   naive_param.nIfm = nIfm;
246   naive_param.nOfm = nOfm;
247   naive_param.ifhp = ifhp;
248   naive_param.ifwp = ifwp;
249   naive_param.ofhp = ofhp;
250   naive_param.ofwp = ofwp;
251   naive_param.ifh = ifh;
252   naive_param.ifw = ifw;
253   naive_param.ofh = ofh;
254   naive_param.ofw = ofw;
255   naive_param.pad_h = pad_h;
256   naive_param.pad_w = pad_w;
257   naive_param.pad_h_in = pad_h_in;
258   naive_param.pad_w_in = pad_w_in;
259   naive_param.pad_h_out = pad_h_out;
260   naive_param.pad_w_out = pad_w_out;
261   naive_param.kh = kh;
262   naive_param.kw = kw;
263   naive_param.stride_h = stride_h;
264   naive_param.stride_w = stride_w;
265 
266   naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
267 
268   AddInputFromArray<float>(image.shape(), image.flat<float>());
269   AddInputFromArray<float>(filter.shape(), filter.flat<float>());
270 
271   // Run Op (TF)
272   TF_ASSERT_OK(RunOpKernel());
273 
274   // Check the output.
275   Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm});
276   naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
277 
278   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
279   libxsmm_free(naive_input);
280   libxsmm_free(naive_output);
281   libxsmm_free(naive_filter);
282 }
283 
284 /*
285 
286 
287 TEST(XsmmConv2DTest, Basic) {
288 
289     auto num_threads =
290         ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
291     // See libxsmm_dnn.h for this struct definition.
292     libxsmm_dnn_conv_desc desc;
293     desc.N = batch;
294     desc.C = in_depth;
295     desc.H = input_rows;
296     desc.W = input_cols;
297     desc.K = out_depth;
298     desc.R = filter_rows;
299     desc.S = filter_cols;
300     desc.u = stride_rows;
301     desc.v = stride_cols;
302     desc.pad_h = pad_rows;
303     desc.pad_w = pad_cols;
304     desc.pad_h_in = pad_rows;  // libxsmm supports only physical padding for now
305     desc.pad_w_in = pad_cols;  // libxsmm supports only physical padding for now
306     desc.pad_h_out = 0;
307     desc.pad_w_out = 0;
308     desc.threads = num_threads;
309     desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
310     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
311     desc.filter_format =
312 LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
313     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
314     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
315     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
316 
317     if (!CanUseXsmmConv2D(desc, data_format)) {
318       return false;
319     }
320 
321     auto input_ptr = input.template flat<float>().data();
322     auto filter_ptr = filter.template flat<float>().data();
323     auto output_ptr = output->template flat<float>().data();
324 
325     bool success = functor::XsmmFwdConv2D<CPUDevice, float>()(
326         ctx, desc, input_ptr, filter_ptr, output_ptr);
327     return success;
328 
329 
330 
331 
332 
333 
334 
335 }
336 */
337 }  // namespace
338 }  // namespace tensorflow
339